source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 13270

Last change on this file since 13270 was 13270, checked in by shaoqun, 17 years ago

replace Category class which is deprecated with Logger class

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1package org.greenstone.gsdl3.service;
2
3
4// Greenstone classes
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Document;
9import org.w3c.dom.Element;
10import org.w3c.dom.Node;
11import org.w3c.dom.Attr;
12import org.w3c.dom.Text;
13import org.w3c.dom.NodeList;
14import org.w3c.dom.NamedNodeMap;
15
16// General Java classes
17import java.io.File;
18import java.util.Vector;
19import java.util.HashMap;
20
21import org.apache.log4j.*;
22
23public class XMLRetrieve extends ServiceRack {
24
25 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
26 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
27 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
28 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
29
30 protected String toc_xsl_name = "";
31 protected String document_encoding = "";
32 protected String document_root_tag = "";
33
34 protected Element collection_doc_list = null;
35
36 protected boolean provide_content = true;
37 protected boolean provide_structure = true;
38 protected boolean provide_metadata = true;
39
40
41 public boolean configure(Element info, Element extra_info) {
42 if (!super.configure(info, extra_info)){
43 return false;
44 }
45 logger.info("configuring XMLRetrieve...");
46 // look for the parameters
47 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
48 HashMap params;
49 String services_to_provide = "";
50 if (param_list != null) {
51 params = GSXML.extractParams(param_list, false);
52 this.toc_xsl_name = (String)params.get("tocXSLT");
53 this.document_encoding = (String)params.get("documentEncoding");
54 this.document_root_tag = (String)params.get("documentRootTag");
55 services_to_provide = (String)params.get("provideServices");
56 }
57 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
58 this.toc_xsl_name = "default_toc";
59 }
60 this.toc_xsl_name = this.toc_xsl_name+".xsl";
61
62 if (this.document_encoding == null || this.document_encoding.equals("")) {
63 this.document_encoding = "UTF-8";
64 }
65
66 if (services_to_provide != null && !services_to_provide.equals("")) {
67 if (services_to_provide.indexOf("content")==-1) {
68 provide_content = false;
69 }
70 if (services_to_provide.indexOf("metadata")==-1) {
71 provide_metadata = false;
72 }
73 if (services_to_provide.indexOf("structure")==-1) {
74 provide_structure = false;
75 }
76
77 }
78
79 // set up short_service_info_ - for now just has name and type
80 Element retrieve_service;
81 if (provide_content) {
82 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
83 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
84 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
85 this.short_service_info.appendChild(retrieve_service);
86 }
87 if (provide_metadata) {
88 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
89 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
90 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
91 this.short_service_info.appendChild(retrieve_service);
92 }
93 if (provide_structure) {
94 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
95 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
96 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
97 this.short_service_info.appendChild(retrieve_service);
98 }
99 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
100 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
101
102 GSEntityResolver resolver = new GSEntityResolver();
103 resolver.setClassLoader(this.class_loader);
104 this.converter.setEntityResolver(resolver);
105 return true;
106 }
107
108 // this may get called but is not useful in the case of retrieve services
109 protected Element getServiceDescription(String service_id, String lang, String subset) {
110
111 Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
112 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
113 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
114 return retrieve_service;
115 }
116
117 protected Element processDocumentContentRetrieve(Element request) {
118 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
119 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
120 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
121
122 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
123 if (doc_list == null) {
124 return result;
125 }
126 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
127 result.appendChild(result_doc_list);
128
129 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
130 for (int i=0; i<docs.getLength(); i++) {
131
132 Element doc = (Element)docs.item(i);
133 Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
134 doc.appendChild(content);
135
136 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
137 String doc_name = getWorkName(node_id);
138
139 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
140 if (doc_elem == null) {
141 continue;
142 }
143
144
145 // if we have asked for the whole doc, just append it
146 if (doc_name.equals(node_id)) {
147 content.appendChild(this.doc.importNode(doc_elem, true));
148 continue;
149 }
150
151 // else we only want a sub section
152
153 Element section = getSection(doc_elem, node_id);
154 if (section != null) {
155 content.appendChild(this.doc.importNode(section, true));
156 }
157
158 } // for each doc
159
160 return result;
161
162 }
163
164 protected Element processDocumentStructureRetrieve(Element request) {
165 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
167 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168
169 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
170 if (doc_list == null) {
171 logger.error("no documents specified in the request. ");
172 return result;
173 }
174
175 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
176 result.appendChild(result_doc_list);
177 // first look for the stylesheet in the collection
178 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
179 if (!stylesheet.exists()) {
180 // now try in the site
181 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
182 }
183 if (!stylesheet.exists()) {
184 logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
185 return result;
186 }
187
188 // for now, we dont have any params, and we always return the structure of the whole document
189
190 XMLTransformer transformer = new XMLTransformer();
191 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
192
193 for (int i=0; i<docs.getLength(); i++) {
194
195 Element doc = (Element)docs.item(i);
196
197 Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
198 doc.appendChild(structure);
199 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
200 // make sure we are at the top level
201 doc_name = getWorkName(doc_name);
202
203 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
204
205 if (!doc_file.exists()) {
206 logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
207 } else {
208 try {
209 Node toc = transformer.transform(stylesheet, doc_file);
210 structure.appendChild(this.doc.importNode(toc, true));
211 } catch (Exception e) {
212 logger.error("couldn't transform the document to get the toc");
213 }
214 }
215
216 }
217
218 return result;
219
220 }
221
222 // this just extracts a bit of text from the section to use as the Title
223 // this should be overwritten for any format that has something more suitable
224 protected Element processDocumentMetadataRetrieve(Element request) {
225 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
226 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
227 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
228
229 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
230 if (doc_list == null) {
231 logger.error("no documents in the request");
232 return result;
233 }
234
235 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
236 result.appendChild(result_doc_list);
237
238 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
239 if (param_list == null) {
240 logger.error("no metadata in the request");
241 return result;
242 }
243
244 Vector meta_name_list = new Vector();
245 boolean all_metadata = false;
246 // Process the request parameters
247 Element param = (Element) param_list.getFirstChild();
248 while (param != null) {
249 // Identify the metadata information desired
250 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
251 String metadata = GSXML.getValue(param);
252 if (metadata.equals("all")) {
253 all_metadata = true;
254 break;
255 }
256 meta_name_list.add(metadata);
257 }
258 param = (Element) param.getNextSibling();
259 }
260
261 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
262 for (int i=0; i<docs.getLength(); i++) {
263 Element doc = (Element)docs.item(i);
264 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
265 String doc_name = getWorkName(node_id);
266
267 Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
268 doc.appendChild(metadata_list);
269 }
270
271 return result;
272 }
273
274 protected Element loadDocument(String doc_name) {
275 // try to find the document
276 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
277
278 if (!doc_file.exists()) {
279 logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
280 return null;
281 }
282
283 Document the_doc = null;
284 try {
285 the_doc = this.converter.getDOM(doc_file, this.document_encoding);
286 } catch (Exception e) {
287 logger.error("couldn't create a DOM from file "+doc_file.getPath());
288 return null;
289 }
290
291 return the_doc.getDocumentElement();
292
293 }
294
295
296 protected Element getSection(Element doc_elem, String node_id) {
297 String [] bits = node_id.split("\\.");
298 if (bits.length > 4) {
299 logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
300 return null;
301 }
302
303 String id="";
304 String tagname = "";
305 String scope = "";
306 if (bits.length==2) {
307 tagname = bits[1];
308 } else {
309 scope = bits[1];
310 tagname = bits[2];
311
312 if (bits.length == 4) {
313 id = bits[3];
314 }
315 }
316 scope = translateScope(scope);
317 Element top=null;
318 if (!scope.equals("")) {
319 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
320 if (top == null) {
321 // something gone wrong
322 return null;
323 }
324 } else {
325 top = doc_elem;
326 }
327
328 NodeList elements = top.getElementsByTagName(tagname);
329 if (elements.getLength() == 0) {
330 return null;
331 }
332 // no id, just return the first one
333 if (id.equals("")) {
334 return (Element)elements.item(0);
335 }
336 // have an id, need to check and find the right one.
337 for (int i=0; i<elements.getLength();i++) {
338 Element e = (Element)elements.item(i);
339 if (e.getAttribute("gs3:id").equals(id)) {
340 return e;
341 }
342 }
343 return null;
344
345 }
346
347 protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
348
349 // our default strategy here is to only return Title and root:Title
350 // ignore all others
351 // the title of a section is just a little bit of the text inside it.
352 // the root_Title is the title from the doc info in the config file
353 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
354 String doc_name = getWorkName(node_id);
355 boolean node_is_root = false;
356 if (doc_name.equals(node_id)) {
357 node_is_root = true;
358 }
359
360 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
361 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
362
363 boolean get_section_title = false;
364
365 if (all) {
366 if (node_is_root) {
367 return (Element)this.doc.importNode(doc_meta_list, true);
368 } else {
369 get_section_title = true;
370 }
371
372 } else {
373 // have to process metadata one by one
374 for (int i=0; i<meta_name_list.size(); i++) {
375 String meta_name = (String) meta_name_list.elementAt(i);
376 String actual_meta_name = meta_name;
377 if (meta_name.startsWith("root_")) {
378 actual_meta_name = meta_name.substring(5);
379 } else {
380 // its a section level one - check to see if doc is root
381 if (!node_is_root) {
382 if (meta_name.equals("Title")) {
383 get_section_title = true;
384 }
385 continue; // move on to teh next metadata
386 }
387 }
388
389 // here, we look for the specific meta elem in doc_meta_list
390 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
391 if (meta_item != null) {
392 meta_item = (Element)this.doc.importNode(meta_item, true);
393 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
394 metadata_list.appendChild(meta_item);
395 }
396 } // for each metadata
397 }
398
399 // now we have processed all teh doc metadata, just have section one to go, if needed
400 if (get_section_title) {
401
402 Element doc_elem = loadDocument(doc_name);
403 if (doc_elem != null) {
404 Element section = getSection(doc_elem, node_id);
405 if (section != null) {
406 Element title_meta = extractTitleMeta(section);
407 if (title_meta != null) {
408 metadata_list.appendChild(title_meta);
409 }
410 }
411 }
412
413 }
414 return metadata_list;
415 }
416
417 protected Element extractTitleMeta(Element section) {
418 Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
419 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
420
421 String title = "dummy title";
422 Text t = this.doc.createTextNode(title);
423 meta_elem.appendChild(t);
424 return meta_elem;
425
426 }
427 // some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
428
429 // the full default nodeID looks like work.scope.tag.id
430 // the shorter versions are work, work.tag, work.scope.tag
431 protected String getWorkName(String node_id) {
432 int pos = node_id.indexOf('.');
433 if (pos == -1) {
434 return node_id;
435 }
436 return node_id.substring(0, pos);
437 }
438
439 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
440 protected String translateScope(String scope) {
441 if (this.document_root_tag != null) {
442 return GSPath.appendLink(this.document_root_tag, scope);
443 }
444 return scope;
445 }
446
447}
448
Note: See TracBrowser for help on using the repository browser.