source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 14225

Last change on this file since 14225 was 14225, checked in by xiao, 17 years ago

Changed getFirstChild() to getFirstElementChild(), in case an extra line break or whitespace has been added before the first element child, which would otherwise cause a ClassCastException.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1package org.greenstone.gsdl3.service;
2
3
4// Greenstone classes
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Document;
9import org.w3c.dom.Element;
10import org.w3c.dom.Node;
11import org.w3c.dom.Attr;
12import org.w3c.dom.Text;
13import org.w3c.dom.NodeList;
14import org.w3c.dom.NamedNodeMap;
15
16// General Java classes
17import java.io.File;
18import java.util.Vector;
19import java.util.HashMap;
20
21import org.apache.log4j.*;
22
23public class XMLRetrieve extends ServiceRack {
24
25 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
26 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
27 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
28 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
29
30 protected String toc_xsl_name = "";
31 protected String document_encoding = "";
32 protected String document_root_tag = "";
33
34 protected Element collection_doc_list = null;
35
36 protected boolean provide_content = true;
37 protected boolean provide_structure = true;
38 protected boolean provide_metadata = true;
39
40
41 public boolean configure(Element info, Element extra_info) {
42 if (!super.configure(info, extra_info)){
43 return false;
44 }
45 logger.info("configuring XMLRetrieve...");
46 // look for the parameters
47 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
48 HashMap params;
49 String services_to_provide = "";
50 if (param_list != null) {
51 params = GSXML.extractParams(param_list, false);
52 this.toc_xsl_name = (String)params.get("tocXSLT");
53 this.document_encoding = (String)params.get("documentEncoding");
54 this.document_root_tag = (String)params.get("documentRootTag");
55 services_to_provide = (String)params.get("provideServices");
56 }
57 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
58 this.toc_xsl_name = "default_toc";
59 }
60 this.toc_xsl_name = this.toc_xsl_name+".xsl";
61
62 if (this.document_encoding == null || this.document_encoding.equals("")) {
63 this.document_encoding = "UTF-8";
64 }
65
66 if (services_to_provide != null && !services_to_provide.equals("")) {
67 if (services_to_provide.indexOf("content")==-1) {
68 provide_content = false;
69 }
70 if (services_to_provide.indexOf("metadata")==-1) {
71 provide_metadata = false;
72 }
73 if (services_to_provide.indexOf("structure")==-1) {
74 provide_structure = false;
75 }
76
77 }
78
79 // set up short_service_info_ - for now just has name and type
80 Element retrieve_service;
81 if (provide_content) {
82 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
83 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
84 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
85 this.short_service_info.appendChild(retrieve_service);
86 }
87 if (provide_metadata) {
88 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
89 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
90 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
91 this.short_service_info.appendChild(retrieve_service);
92 }
93 if (provide_structure) {
94 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
95 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
96 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
97 this.short_service_info.appendChild(retrieve_service);
98 }
99 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
100 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
101
102 GSEntityResolver resolver = new GSEntityResolver();
103 resolver.setClassLoader(this.class_loader);
104 this.converter.setEntityResolver(resolver);
105 return true;
106 }
107
108 // this may get called but is not useful in the case of retrieve services
109 protected Element getServiceDescription(String service_id, String lang, String subset) {
110
111 Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
112 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
113 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
114 return retrieve_service;
115 }
116
117 protected Element processDocumentContentRetrieve(Element request) {
118 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
119 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
120 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
121
122 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
123 if (doc_list == null) {
124 return result;
125 }
126 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
127 result.appendChild(result_doc_list);
128
129 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
130 for (int i=0; i<docs.getLength(); i++) {
131
132 Element doc = (Element)docs.item(i);
133 Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
134 doc.appendChild(content);
135
136 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
137 String doc_name = getWorkName(node_id);
138
139 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
140 if (doc_elem == null) {
141 continue;
142 }
143
144
145 // if we have asked for the whole doc, just append it
146 if (doc_name.equals(node_id)) {
147 content.appendChild(this.doc.importNode(doc_elem, true));
148 continue;
149 }
150
151 // else we only want a sub section
152
153 Element section = getSection(doc_elem, node_id);
154 if (section != null) {
155 content.appendChild(this.doc.importNode(section, true));
156 }
157
158 } // for each doc
159
160 return result;
161
162 }
163
164 protected Element processDocumentStructureRetrieve(Element request) {
165 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
167 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168
169 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
170 if (doc_list == null) {
171 logger.error("no documents specified in the request. ");
172 return result;
173 }
174
175 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
176 result.appendChild(result_doc_list);
177 // first look for the stylesheet in the collection
178 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
179 if (!stylesheet.exists()) {
180 // now try in the site
181 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
182 }
183 if (!stylesheet.exists()) {
184 logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
185 return result;
186 }
187
188 // for now, we dont have any params, and we always return the structure of the whole document
189
190 XMLTransformer transformer = new XMLTransformer();
191 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
192
193 for (int i=0; i<docs.getLength(); i++) {
194
195 Element doc = (Element)docs.item(i);
196
197 Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
198 doc.appendChild(structure);
199 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
200 // make sure we are at the top level
201 doc_name = getWorkName(doc_name);
202
203 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
204
205 if (!doc_file.exists()) {
206 logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
207 } else {
208 try {
209 Node toc = transformer.transform(stylesheet, doc_file);
210 structure.appendChild(this.doc.importNode(toc, true));
211 } catch (Exception e) {
212 logger.error("couldn't transform the document to get the toc");
213 }
214 }
215
216 }
217
218 return result;
219
220 }
221
222 // this just extracts a bit of text from the section to use as the Title
223 // this should be overwritten for any format that has something more suitable
224 protected Element processDocumentMetadataRetrieve(Element request) {
225 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
226 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
227 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
228
229 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
230 if (doc_list == null) {
231 logger.error("no documents in the request");
232 return result;
233 }
234
235 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
236 result.appendChild(result_doc_list);
237
238 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
239 if (param_list == null) {
240 logger.error("no metadata in the request");
241 return result;
242 }
243
244 Vector meta_name_list = new Vector();
245 boolean all_metadata = false;
246 // Process the request parameters
247 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
248 while (param != null) {
249 // Identify the metadata information desired
250 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
251 String metadata = GSXML.getValue(param);
252 if (metadata.equals("all")) {
253 all_metadata = true;
254 break;
255 }
256 meta_name_list.add(metadata);
257 }
258 param = (Element) param.getNextSibling();
259 }
260
261 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
262 for (int i=0; i<docs.getLength(); i++) {
263 Element doc = (Element)docs.item(i);
264 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
265 String doc_name = getWorkName(node_id);
266
267 Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
268 doc.appendChild(metadata_list);
269 }
270
271 return result;
272 }
273
274 protected Element loadDocument(String doc_name) {
275 // try to find the document
276 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
277
278 if (!doc_file.exists()) {
279 logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
280 return null;
281 }
282
283 Document the_doc = null;
284 try {
285 the_doc = this.converter.getDOM(doc_file, this.document_encoding);
286 } catch (Exception e) {
287 logger.error("couldn't create a DOM from file "+doc_file.getPath());
288 return null;
289 }
290
291 return the_doc.getDocumentElement();
292
293 }
294
295
296 protected Element getSection(Element doc_elem, String node_id) {
297 String [] bits = node_id.split("\\.");
298 if (bits.length > 4) {
299 logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
300 return null;
301 }
302
303 String id="";
304 String tagname = "";
305 String scope = "";
306 if (bits.length==2) {
307 tagname = bits[1];
308 } else {
309 scope = bits[1];
310 tagname = bits[2];
311
312 if (bits.length == 4) {
313 id = bits[3];
314 }
315 }
316 scope = translateScope(scope);
317 Element top=null;
318 if (!scope.equals("")) {
319 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
320 if (top == null) {
321 // something gone wrong
322 return null;
323 }
324 } else {
325 top = doc_elem;
326 }
327
328 NodeList elements = top.getElementsByTagName(tagname);
329 if (elements.getLength() == 0) {
330 return null;
331 }
332 // no id, just return the first one
333 if (id.equals("")) {
334 return (Element)elements.item(0);
335 }
336 // have an id, need to check and find the right one.
337 for (int i=0; i<elements.getLength();i++) {
338 Element e = (Element)elements.item(i);
339 if (e.getAttribute("gs3:id").equals(id)) {
340 return e;
341 }
342 }
343 return null;
344
345 }
346
347 protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
348
349 // our default strategy here is to only return Title and root:Title
350 // ignore all others
351 // the title of a section is just a little bit of the text inside it.
352 // the root_Title is the title from the doc info in the config file
353 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
354 String doc_name = getWorkName(node_id);
355 boolean node_is_root = false;
356 if (doc_name.equals(node_id)) {
357 node_is_root = true;
358 }
359
360 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
361 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
362
363 boolean get_section_title = false;
364
365 if (all) {
366 if (node_is_root) {
367 return (Element)this.doc.importNode(doc_meta_list, true);
368 } else {
369 get_section_title = true;
370 }
371
372 } else {
373 // have to process metadata one by one
374 for (int i=0; i<meta_name_list.size(); i++) {
375 String meta_name = (String) meta_name_list.elementAt(i);
376 String actual_meta_name = meta_name;
377 if (meta_name.startsWith("root_")) {
378 actual_meta_name = meta_name.substring(5);
379 } else {
380 // its a section level one - check to see if doc is root
381 if (!node_is_root) {
382 if (meta_name.equals("Title")) {
383 get_section_title = true;
384 }
385 continue; // move on to teh next metadata
386 }
387 }
388
389 // here, we look for the specific meta elem in doc_meta_list
390 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
391 if (meta_item != null) {
392 meta_item = (Element)this.doc.importNode(meta_item, true);
393 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
394 metadata_list.appendChild(meta_item);
395 }
396 } // for each metadata
397 }
398
399 // now we have processed all teh doc metadata, just have section one to go, if needed
400 if (get_section_title) {
401
402 Element doc_elem = loadDocument(doc_name);
403 if (doc_elem != null) {
404 Element section = getSection(doc_elem, node_id);
405 if (section != null) {
406 Element title_meta = extractTitleMeta(section);
407 if (title_meta != null) {
408 metadata_list.appendChild(title_meta);
409 }
410 }
411 }
412
413 }
414 return metadata_list;
415 }
416
417 protected Element extractTitleMeta(Element section) {
418 Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
419 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
420
421 String title = "dummy title";
422 Text t = this.doc.createTextNode(title);
423 meta_elem.appendChild(t);
424 return meta_elem;
425
426 }
427 // some methods for handling nodeIDs - the format may differ between collections, so subclasses can override them
428
429 // the full default nodeID looks like work.scope.tag.id
430 // the shorter versions are work, work.tag, work.scope.tag
431 protected String getWorkName(String node_id) {
432 int pos = node_id.indexOf('.');
433 if (pos == -1) {
434 return node_id;
435 }
436 return node_id.substring(0, pos);
437 }
438
439 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
440 protected String translateScope(String scope) {
441 if (this.document_root_tag != null) {
442 return GSPath.appendLink(this.document_root_tag, scope);
443 }
444 return scope;
445 }
446
447}
448
Note: See TracBrowser for help on using the repository browser.