source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 6872

Last change on this file since 6872 was 6872, checked in by kjdon, 20 years ago

fixed a comment

  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1package org.greenstone.gsdl3.service;
2
3
4// Greenstone classes
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Document;
9import org.w3c.dom.Element;
10import org.w3c.dom.Node;
11import org.w3c.dom.Attr;
12import org.w3c.dom.Text;
13import org.w3c.dom.NodeList;
14import org.w3c.dom.NamedNodeMap;
15
16// General Java classes
17import java.io.File;
18import java.util.Vector;
19import java.util.HashMap;
20
21
22public class XMLRetrieve extends ServiceRack {
23
24 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
25 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
26 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
27
28 protected String toc_xsl_name = "";
29 protected String document_encoding = "";
30 protected String document_root_tag = "";
31
32 protected Element collection_doc_list = null;
33
34 protected boolean provide_content = true;
35 protected boolean provide_structure = true;
36 protected boolean provide_metadata = true;
37
38
39 public boolean configure(Element info, Element extra_info) {
40
41 System.out.println("configuring XMLRetrieve...");
42 // look for the parameters
43 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
44 HashMap params;
45 String services_to_provide = "";
46 if (param_list != null) {
47 params = GSXML.extractParams(param_list, false);
48 this.toc_xsl_name = (String)params.get("tocXSLT");
49 this.document_encoding = (String)params.get("documentEncoding");
50 this.document_root_tag = (String)params.get("documentRootTag");
51 services_to_provide = (String)params.get("provideServices");
52 }
53 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
54 this.toc_xsl_name = "default_toc";
55 }
56 this.toc_xsl_name = this.toc_xsl_name+".xsl";
57
58 if (this.document_encoding == null || this.document_encoding.equals("")) {
59 this.document_encoding = "UTF-8";
60 }
61
62 if (services_to_provide != null && !services_to_provide.equals("")) {
63 if (services_to_provide.indexOf("content")==-1) {
64 provide_content = false;
65 }
66 if (services_to_provide.indexOf("metadata")==-1) {
67 provide_metadata = false;
68 }
69 if (services_to_provide.indexOf("structure")==-1) {
70 provide_structure = false;
71 }
72
73 }
74
75 // set up short_service_info_ - for now just has name and type
76 Element retrieve_service;
77 if (provide_content) {
78 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
79 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
80 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
81 this.short_service_info.appendChild(retrieve_service);
82 }
83 if (provide_metadata) {
84 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
85 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
86 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
87 this.short_service_info.appendChild(retrieve_service);
88 }
89 if (provide_structure) {
90 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
91 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
92 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
93 this.short_service_info.appendChild(retrieve_service);
94 }
95 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
96 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
97
98 this.converter.setEntityResolver(new GSEntityResolver());
99 return true;
100 }
101
102 // this may get called but is not useful in the case of retrieve services
103 protected Element getServiceDescription(String service_id, String lang, String subset) {
104
105 Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
106 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
107 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
108 return retrieve_service;
109 }
110
111 protected Element processDocumentContentRetrieve(Element request) {
112 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
113 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
114 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
115
116 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
117 if (doc_list == null) {
118 return result;
119 }
120 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
121 result.appendChild(result_doc_list);
122
123 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
124 for (int i=0; i<docs.getLength(); i++) {
125
126 Element doc = (Element)docs.item(i);
127 Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
128 doc.appendChild(content);
129
130 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
131 String doc_name = getWorkName(node_id);
132
133 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
134 if (doc_elem == null) {
135 continue;
136 }
137
138
139 // if we have asked for the whole doc, just append it
140 if (doc_name.equals(node_id)) {
141 content.appendChild(this.doc.importNode(doc_elem, true));
142 continue;
143 }
144
145 // else we only want a sub section
146
147 Element section = getSection(doc_elem, node_id);
148 if (section != null) {
149 content.appendChild(this.doc.importNode(section, true));
150 }
151
152 } // for each doc
153
154 return result;
155
156 }
157
158 protected Element processDocumentStructureRetrieve(Element request) {
159 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
160 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
161 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
162
163 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
164 if (doc_list == null) {
165 System.err.println("XMLRetrieve.DocumentStructureRetrieve: no documents specified in the request. ");
166 return result;
167 }
168
169 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
170 result.appendChild(result_doc_list);
171 // first look for the stylesheet in the collection
172 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
173 if (!stylesheet.exists()) {
174 // now try in the site
175 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
176 }
177 if (!stylesheet.exists()) {
178 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
179 return result;
180 }
181
182 // for now, we dont have any params, and we always return the structure of the whole document
183
184 XMLTransformer transformer = new XMLTransformer();
185 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
186
187 for (int i=0; i<docs.getLength(); i++) {
188
189 Element doc = (Element)docs.item(i);
190
191 Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
192 doc.appendChild(structure);
193 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
194 // make sure we are at the top level
195 doc_name = getWorkName(doc_name);
196
197 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
198
199 if (!doc_file.exists()) {
200 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
201 } else {
202 try {
203 Node toc = transformer.transform(stylesheet, doc_file);
204 structure.appendChild(this.doc.importNode(toc, true));
205 } catch (Exception e) {
206 System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't transform the document to get the toc");
207 }
208 }
209
210 }
211
212 return result;
213
214 }
215
216 // this just extracts a bit of text from the section to use as the Title
217 // this should be overwritten for any format that has something more suitable
218 protected Element processDocumentMetadataRetrieve(Element request) {
219 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
220 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
221 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
222
223 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
224 if (doc_list == null) {
225 System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no documents in the request");
226 return result;
227 }
228
229 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
230 result.appendChild(result_doc_list);
231
232 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
233 if (param_list == null) {
234 System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no metadata in the request");
235 return result;
236 }
237
238 Vector meta_name_list = new Vector();
239 boolean all_metadata = false;
240 // Process the request parameters
241 Element param = (Element) param_list.getFirstChild();
242 while (param != null) {
243 // Identify the metadata information desired
244 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
245 String metadata = GSXML.getValue(param);
246 if (metadata.equals("all")) {
247 all_metadata = true;
248 break;
249 }
250 meta_name_list.add(metadata);
251 }
252 param = (Element) param.getNextSibling();
253 }
254
255 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
256 for (int i=0; i<docs.getLength(); i++) {
257 Element doc = (Element)docs.item(i);
258 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
259 String doc_name = getWorkName(node_id);
260
261 Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
262 doc.appendChild(metadata_list);
263 }
264
265 return result;
266 }
267
268 protected Element loadDocument(String doc_name) {
269 // try to find the document
270 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
271
272 if (!doc_file.exists()) {
273 System.out.println("XMLRetrieve.loadDocument: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
274 return null;
275 }
276
277 Document the_doc = null;
278 try {
279 the_doc = this.converter.getDOM(doc_file, this.document_encoding);
280 } catch (Exception e) {
281 System.err.println("XMLRetrieve.loadDocument: couldn't create a DOM from file "+doc_file.getPath());
282 return null;
283 }
284
285 return the_doc.getDocumentElement();
286
287 }
288
289
290 protected Element getSection(Element doc_elem, String node_id) {
291 String [] bits = node_id.split("\\.");
292 if (bits.length > 4) {
293 System.err.println("XMLRetrieve.getSection: badly formatted node id ("+node_id +"), cant retrieve the section");
294 return null;
295 }
296
297 String id="";
298 String tagname = "";
299 String scope = "";
300 if (bits.length==2) {
301 tagname = bits[1];
302 } else {
303 scope = bits[1];
304 tagname = bits[2];
305
306 if (bits.length == 4) {
307 id = bits[3];
308 }
309 }
310 scope = translateScope(scope);
311 Element top=null;
312 if (!scope.equals("")) {
313 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
314 if (top == null) {
315 // something gone wrong
316 return null;
317 }
318 } else {
319 top = doc_elem;
320 }
321
322 NodeList elements = top.getElementsByTagName(tagname);
323 if (elements.getLength() == 0) {
324 return null;
325 }
326 // no id, just return the first one
327 if (id.equals("")) {
328 return (Element)elements.item(0);
329 }
330 // have an id, need to check and find the right one.
331 for (int i=0; i<elements.getLength();i++) {
332 Element e = (Element)elements.item(i);
333 if (e.getAttribute("gs3:id").equals(id)) {
334 return e;
335 }
336 }
337 return null;
338
339 }
340
341 protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
342
343 // our default strategy here is to only return Title and root:Title
344 // ignore all others
345 // the title of a section is just a little bit of the text inside it.
346 // the root_Title is the title from the doc info in the config file
347 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
348 String doc_name = getWorkName(node_id);
349 boolean node_is_root = false;
350 if (doc_name.equals(node_id)) {
351 node_is_root = true;
352 }
353
354 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
355 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
356
357 boolean get_section_title = false;
358
359 if (all) {
360 if (node_is_root) {
361 return (Element)this.doc.importNode(doc_meta_list, true);
362 } else {
363 get_section_title = true;
364 }
365
366 } else {
367 // have to process metadata one by one
368 for (int i=0; i<meta_name_list.size(); i++) {
369 String meta_name = (String) meta_name_list.elementAt(i);
370 String actual_meta_name = meta_name;
371 if (meta_name.startsWith("root_")) {
372 actual_meta_name = meta_name.substring(5);
373 } else {
374 // its a section level one - check to see if doc is root
375 if (!node_is_root) {
376 if (meta_name.equals("Title")) {
377 get_section_title = true;
378 }
379 continue; // move on to teh next metadata
380 }
381 }
382
383 // here, we look for the specific meta elem in doc_meta_list
384 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
385 if (meta_item != null) {
386 meta_item = (Element)this.doc.importNode(meta_item, true);
387 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
388 metadata_list.appendChild(meta_item);
389 }
390 } // for each metadata
391 }
392
393 // now we have processed all teh doc metadata, just have section one to go, if needed
394 if (get_section_title) {
395
396 Element doc_elem = loadDocument(doc_name);
397 if (doc_elem != null) {
398 Element section = getSection(doc_elem, node_id);
399 if (section != null) {
400 Element title_meta = extractTitleMeta(section);
401 if (title_meta != null) {
402 metadata_list.appendChild(title_meta);
403 }
404 }
405 }
406
407 }
408 return metadata_list;
409 }
410
411 protected Element extractTitleMeta(Element section) {
412 Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
413 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
414
415 String title = "dummy title";
416 Text t = this.doc.createTextNode(title);
417 meta_elem.appendChild(t);
418 return meta_elem;
419
420 }
421 // some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
422
423 // the full default nodeID looks like work.scope.tag.id
424 // the shorter versions are work, work.tag, work.scope.tag
425 protected String getWorkName(String node_id) {
426 int pos = node_id.indexOf('.');
427 if (pos == -1) {
428 return node_id;
429 }
430 return node_id.substring(0, pos);
431 }
432
433 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
434 protected String translateScope(String scope) {
435 if (this.document_root_tag != null) {
436 return GSPath.appendLink(this.document_root_tag, scope);
437 }
438 return scope;
439 }
440
441}
442
Note: See TracBrowser for help on using the repository browser.