source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 5945

Last change on this file since 5945 was 5264, checked in by kjdon, 21 years ago

removed unnecessary print statements

  • Property svn:keywords set to Author Date Id Revision
File size: 14.5 KB
Line 
1package org.greenstone.gsdl3.service;
2
3
4// Greenstone classes
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Document;
9import org.w3c.dom.Element;
10import org.w3c.dom.Node;
11import org.w3c.dom.Attr;
12import org.w3c.dom.Text;
13import org.w3c.dom.NodeList;
14import org.w3c.dom.NamedNodeMap;
15
16// General Java classes
17import java.io.File;
18import java.util.Vector;
19import java.util.HashMap;
20
21
22public class XMLRetrieve extends ServiceRack {
23
24 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
25 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
26 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
27
28 protected String toc_xsl_name = "";
29 protected String document_encoding = "";
30 protected Element collection_doc_list = null;
31
32 protected boolean provide_content = true;
33 protected boolean provide_structure = true;
34 protected boolean provide_metadata = true;
35
36
37 public boolean configure(Element info, Element extra_info) {
38
39 // look for the parameters
40 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
41 HashMap params;
42 String services_to_provide = "";
43 if (param_list != null) {
44 params = GSXML.extractParams(param_list, false);
45 this.toc_xsl_name = (String)params.get("tocXSLT");
46 this.document_encoding = (String)params.get("documentEncoding");
47 services_to_provide = (String)params.get("provideServices");
48 }
49 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
50 this.toc_xsl_name = "default_toc";
51 }
52 this.toc_xsl_name = this.toc_xsl_name+".xsl";
53
54 if (this.document_encoding == null || this.document_encoding.equals("")) {
55 this.document_encoding = "UTF-8";
56 }
57
58 if (services_to_provide != null && !services_to_provide.equals("")) {
59 if (services_to_provide.indexOf("content")==-1) {
60 provide_content = false;
61 }
62 if (services_to_provide.indexOf("metadata")==-1) {
63 provide_metadata = false;
64 }
65 if (services_to_provide.indexOf("structure")==-1) {
66 provide_structure = false;
67 }
68
69 }
70
71 // set up short_service_info_ - for now just has name and type
72 Element retrieve_service;
73 if (provide_content) {
74 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
75 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
76 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
77 this.short_service_info.appendChild(retrieve_service);
78 }
79 if (provide_metadata) {
80 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
81 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
82 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
83 this.short_service_info.appendChild(retrieve_service);
84 }
85 if (provide_structure) {
86 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
87 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
88 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
89 this.short_service_info.appendChild(retrieve_service);
90 }
91 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
92 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
93
94 return true;
95 }
96
97 // this may get called but is not useful in the case of retrieve services
98 protected Element getServiceDescription(String service_id, String lang, String subset) {
99
100 Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
101 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
102 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
103 return retrieve_service;
104 }
105
106 protected Element processDocumentContentRetrieve(Element request) {
107 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
108 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
109 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
110
111 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
112 if (doc_list == null) {
113 return result;
114 }
115 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
116 result.appendChild(result_doc_list);
117
118 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
119 for (int i=0; i<docs.getLength(); i++) {
120
121 Element doc = (Element)docs.item(i);
122 Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
123 doc.appendChild(content);
124
125 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
126 String doc_name = getWorkName(node_id);
127
128 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
129 if (doc_elem == null) {
130 continue;
131 }
132
133 // if we have asked for the whole doc, just append it
134 if (doc_name.equals(node_id)) {
135 content.appendChild(this.doc.importNode(doc_elem, true));
136 continue;
137 }
138
139 // else we only want a sub section
140
141 Element section = getSection(doc_elem, node_id);
142 if (section != null) {
143 content.appendChild(this.doc.importNode(section, true));
144 }
145
146 } // for each doc
147
148 return result;
149
150 }
151
152 protected Element processDocumentStructureRetrieve(Element request) {
153 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
154 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
155 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
156
157 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
158 if (doc_list == null) {
159 System.err.println("XMLRetrieve.DocumentStructureRetrieve: no documents specified in the request. ");
160 return result;
161 }
162
163 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
164 result.appendChild(result_doc_list);
165 // first look for the stylesheet in the collection
166 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
167 if (!stylesheet.exists()) {
168 // now try in the site
169 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
170 }
171 if (!stylesheet.exists()) {
172 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
173 return result;
174 }
175
176 // for now, we dont have any params, and we always return the structure of the whole document
177
178 XMLTransformer transformer = new XMLTransformer();
179 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
180
181 for (int i=0; i<docs.getLength(); i++) {
182
183 Element doc = (Element)docs.item(i);
184
185 Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
186 doc.appendChild(structure);
187 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
188 // make sure we are at the top level
189 doc_name = getWorkName(doc_name);
190
191 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
192
193 if (!doc_file.exists()) {
194 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
195 } else {
196 try {
197 Node toc = transformer.transform(stylesheet, doc_file);
198 structure.appendChild(this.doc.importNode(toc, true));
199 } catch (Exception e) {
200 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't transform the document to get the toc");
201 }
202 }
203
204 }
205
206 return result;
207
208 }
209
210 // this just extracts a bit of text from the section to use as the Title
211 // this should be overridden for any format that has something more suitable
212 protected Element processDocumentMetadataRetrieve(Element request) {
213 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
214 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
215 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
216
217 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
218 if (doc_list == null) {
219 System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no documents in the request");
220 return result;
221 }
222
223 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
224 result.appendChild(result_doc_list);
225
226 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
227 if (param_list == null) {
228 System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no metadata in the request");
229 return result;
230 }
231
232 Vector meta_name_list = new Vector();
233 boolean all_metadata = false;
234 // Process the request parameters
235 Element param = (Element) param_list.getFirstChild();
236 while (param != null) {
237 // Identify the metadata information desired
238 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
239 String metadata = GSXML.getValue(param);
240 if (metadata.equals("all")) {
241 all_metadata = true;
242 break;
243 }
244 meta_name_list.add(metadata);
245 }
246 param = (Element) param.getNextSibling();
247 }
248
249 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
250 for (int i=0; i<docs.getLength(); i++) {
251 Element doc = (Element)docs.item(i);
252 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
253 String doc_name = getWorkName(node_id);
254
255 Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
256 doc.appendChild(metadata_list);
257 }
258
259 return result;
260 }
261
262 protected Element loadDocument(String doc_name) {
263 // try to find the document
264 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
265
266 if (!doc_file.exists()) {
267 System.out.println("XMLRetrieve.loadDocument: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
268 return null;
269 }
270
271 Document the_doc = null;
272 try {
273 the_doc = this.converter.getDOM(doc_file, this.document_encoding);
274 } catch (Exception e) {
275 System.err.println("XMLRetrieve.loadDocument: couldn't create a DOM from file "+doc_file.getPath());
276 return null;
277 }
278
279 return the_doc.getDocumentElement();
280
281 }
282
283
284 protected Element getSection(Element doc_elem, String node_id) {
285 String [] bits = node_id.split("\\.");
286 if (bits.length > 4) {
287 System.err.println("XMLRetrieve.getSection: badly formatted node id ("+node_id +"), cant retrieve the section");
288 return null;
289 }
290
291 String id="";
292 String tagname = "";
293 String scope = "";
294 if (bits.length==2) {
295 tagname = bits[1];
296 } else {
297 scope = bits[1];
298 tagname = bits[2];
299
300 if (bits.length == 4) {
301 id = bits[3];
302 }
303 }
304 scope = translateScope(scope);
305 Element top=null;
306 if (!scope.equals("")) {
307 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
308 if (top == null) {
309 // something gone wrong
310 return null;
311 }
312 } else {
313 top = doc_elem;
314 }
315
316 NodeList elements = top.getElementsByTagName(tagname);
317 if (elements.getLength() == 0) {
318 return null;
319 }
320 // no id, just return the first one
321 if (id.equals("")) {
322 return (Element)elements.item(0);
323 }
324 // have an id, need to check and find the right one.
325 for (int i=0; i<elements.getLength();i++) {
326 Element e = (Element)elements.item(i);
327 if (e.getAttribute("gs3:id").equals(id)) {
328 return e;
329 }
330 }
331 return null;
332
333 }
334
335 protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
336
337 // our default strategy here is to only return Title and root:Title
338 // ignore all others
339 // the title of a section is just a little bit of the text inside it.
340 // the root:Title is the title from the doc info in the config file
341 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
342 String doc_name = getWorkName(node_id);
343 boolean node_is_root = false;
344 if (doc_name.equals(node_id)) {
345 node_is_root = true;
346 }
347
348 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
349 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
350
351 boolean get_section_title = false;
352
353 if (all) {
354 if (node_is_root) {
355 return (Element)this.doc.importNode(doc_meta_list, true);
356 } else {
357 get_section_title = true;
358 }
359
360 } else {
361 // have to process metadata one by one
362 for (int i=0; i<meta_name_list.size(); i++) {
363 String meta_name = (String) meta_name_list.elementAt(i);
364 String actual_meta_name = meta_name;
365 if (meta_name.startsWith("root:")) {
366 actual_meta_name = meta_name.substring(5);
367 } else {
368 // its a section level one - check to see if doc is root
369 if (!node_is_root) {
370 if (meta_name.equals("Title")) {
371 get_section_title = true;
372 }
373 continue; // move on to teh next metadata
374 }
375 }
376
377 // here, we look for the specific meta elem in doc_meta_list
378 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
379 if (meta_item != null) {
380 meta_item = (Element)this.doc.importNode(meta_item, true);
381 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
382 metadata_list.appendChild(meta_item);
383 }
384 } // for each metadata
385 }
386
387 // now we have processed all teh doc metadata, just have section one to go, if needed
388 if (get_section_title) {
389
390 Element doc_elem = loadDocument(doc_name);
391 if (doc_elem != null) {
392 Element section = getSection(doc_elem, node_id);
393 if (section != null) {
394 Element title_meta = extractTitleMeta(section);
395 metadata_list.appendChild(title_meta);
396
397 }
398 }
399
400 }
401 return metadata_list;
402 }
403
404 protected Element extractTitleMeta(Element section) {
405 Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
406 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
407
408 String title = "dummy title";
409 Text t = this.doc.createTextNode(title);
410 meta_elem.appendChild(t);
411 return meta_elem;
412
413 }
414 // some methods for handling nodeIDs - they may be different for different collections, so they can be overridden
415
416 // the full default nodeID looks like work.scope.tag.id
417 // the shorter versions are work, work.tag, work.scope.tag
418 protected String getWorkName(String node_id) {
419 int pos = node_id.indexOf('.');
420 if (pos == -1) {
421 return node_id;
422 }
423 return node_id.substring(0, pos);
424 }
425
426 // this assumes that the scope refers to a top level node - this may be overridden if the scope bit in the id is a shorthand of some sort
427 protected String translateScope(String scope) {
428 return scope;
429 }
430
431}
Note: See TracBrowser for help on using the repository browser.