source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 25635

Last change on this file since 25635 was 25635, checked in by sjm84, 12 years ago

Fixing Greenstone 3's use (or lack thereof) of generics. This was done automatically, so we may want to change it over time. This change will also auto-format any files that have not already been formatted.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.8 KB
Line 
1package org.greenstone.gsdl3.service;
2
3
4// Greenstone classes
5import org.greenstone.gsdl3.util.*;
6
7// XML classes
8import org.w3c.dom.Document;
9import org.w3c.dom.Element;
10import org.w3c.dom.Node;
11import org.w3c.dom.Attr;
12import org.w3c.dom.Text;
13import org.w3c.dom.NodeList;
14import org.w3c.dom.NamedNodeMap;
15
16// General Java classes
17import java.io.File;
18import java.io.Serializable;
19import java.util.Vector;
20import java.util.HashMap;
21
22import org.apache.log4j.*;
23
24public class XMLRetrieve extends ServiceRack {
25
26 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
27 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
28 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
29 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
30
31 protected String toc_xsl_name = "";
32 protected String document_encoding = "";
33 protected String document_root_tag = "";
34
35 protected Element collection_doc_list = null;
36
37 protected boolean provide_content = true;
38 protected boolean provide_structure = true;
39 protected boolean provide_metadata = true;
40
41
42 public boolean configure(Element info, Element extra_info) {
43 if (!super.configure(info, extra_info)){
44 return false;
45 }
46 logger.info("configuring XMLRetrieve...");
47 // look for the parameters
48 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
49 HashMap<String, Serializable> params;
50 String services_to_provide = "";
51 if (param_list != null) {
52 params = GSXML.extractParams(param_list, false);
53 this.toc_xsl_name = (String)params.get("tocXSLT");
54 this.document_encoding = (String)params.get("documentEncoding");
55 this.document_root_tag = (String)params.get("documentRootTag");
56 services_to_provide = (String)params.get("provideServices");
57 }
58 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
59 this.toc_xsl_name = "default_toc";
60 }
61 this.toc_xsl_name = this.toc_xsl_name+".xsl";
62
63 if (this.document_encoding == null || this.document_encoding.equals("")) {
64 this.document_encoding = "UTF-8";
65 }
66
67 if (services_to_provide != null && !services_to_provide.equals("")) {
68 if (services_to_provide.indexOf("content")==-1) {
69 provide_content = false;
70 }
71 if (services_to_provide.indexOf("metadata")==-1) {
72 provide_metadata = false;
73 }
74 if (services_to_provide.indexOf("structure")==-1) {
75 provide_structure = false;
76 }
77
78 }
79
80 // set up short_service_info_ - for now just has name and type
81 Element retrieve_service;
82 if (provide_content) {
83 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
84 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
85 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
86 this.short_service_info.appendChild(retrieve_service);
87 }
88 if (provide_metadata) {
89 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
90 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
91 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
92 this.short_service_info.appendChild(retrieve_service);
93 }
94 if (provide_structure) {
95 retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
96 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
97 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
98 this.short_service_info.appendChild(retrieve_service);
99 }
100 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
101 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
102
103 GSEntityResolver resolver = new GSEntityResolver();
104 resolver.setClassLoader(this.class_loader);
105 this.converter.setEntityResolver(resolver);
106 return true;
107 }
108
109 // this may get called but is not useful in the case of retrieve services
110 protected Element getServiceDescription(String service_id, String lang, String subset) {
111
112 Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
113 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
114 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
115 return retrieve_service;
116 }
117
118 protected Element processDocumentContentRetrieve(Element request) {
119 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
120 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
121 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
122
123 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
124 if (doc_list == null) {
125 return result;
126 }
127 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
128 result.appendChild(result_doc_list);
129
130 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
131 for (int i=0; i<docs.getLength(); i++) {
132
133 Element doc = (Element)docs.item(i);
134 Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
135 doc.appendChild(content);
136
137 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
138 String doc_name = getWorkName(node_id);
139
140 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
141 if (doc_elem == null) {
142 continue;
143 }
144
145
146 // if we have asked for the whole doc, just append it
147 if (doc_name.equals(node_id)) {
148 content.appendChild(this.doc.importNode(doc_elem, true));
149 continue;
150 }
151
152 // else we only want a sub section
153
154 Element section = getSection(doc_elem, node_id);
155 if (section != null) {
156 content.appendChild(this.doc.importNode(section, true));
157 }
158
159 } // for each doc
160
161 return result;
162
163 }
164
165 protected Element processDocumentStructureRetrieve(Element request) {
166 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
167 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
168 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
169
170 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
171 if (doc_list == null) {
172 logger.error("no documents specified in the request. ");
173 return result;
174 }
175
176 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
177 result.appendChild(result_doc_list);
178 // first look for the stylesheet in the collection
179 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
180 if (!stylesheet.exists()) {
181 // now try in the site
182 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
183 }
184 if (!stylesheet.exists()) {
185 logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
186 return result;
187 }
188
189 // for now, we dont have any params, and we always return the structure of the whole document
190
191 XMLTransformer transformer = new XMLTransformer();
192 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
193
194 for (int i=0; i<docs.getLength(); i++) {
195
196 Element doc = (Element)docs.item(i);
197
198 Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
199 doc.appendChild(structure);
200 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
201 // make sure we are at the top level
202 doc_name = getWorkName(doc_name);
203
204 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
205
206 if (!doc_file.exists()) {
207 logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
208 } else {
209 try {
210 Node toc = transformer.transform(stylesheet, doc_file);
211 structure.appendChild(this.doc.importNode(toc, true));
212 } catch (Exception e) {
213 logger.error("couldn't transform the document to get the toc");
214 }
215 }
216
217 }
218
219 return result;
220
221 }
222
223 // this just extracts a bit of text from the section to use as the Title
224 // this should be overwritten for any format that has something more suitable
225 protected Element processDocumentMetadataRetrieve(Element request) {
226 Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
227 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
228 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
229
230 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
231 if (doc_list == null) {
232 logger.error("no documents in the request");
233 return result;
234 }
235
236 Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
237 result.appendChild(result_doc_list);
238
239 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
240 if (param_list == null) {
241 logger.error("no metadata in the request");
242 return result;
243 }
244
245 Vector<String> meta_name_list = new Vector<String>();
246 boolean all_metadata = false;
247 // Process the request parameters
248 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
249 while (param != null) {
250 // Identify the metadata information desired
251 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
252 String metadata = GSXML.getValue(param);
253 if (metadata.equals("all")) {
254 all_metadata = true;
255 break;
256 }
257 meta_name_list.add(metadata);
258 }
259 param = (Element) param.getNextSibling();
260 }
261
262 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
263 for (int i=0; i<docs.getLength(); i++) {
264 Element doc = (Element)docs.item(i);
265 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
266 String doc_name = getWorkName(node_id);
267
268 Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
269 doc.appendChild(metadata_list);
270 }
271
272 return result;
273 }
274
275 protected Element loadDocument(String doc_name) {
276 // try to find the document
277 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
278
279 if (!doc_file.exists()) {
280 logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
281 return null;
282 }
283
284 Document the_doc = null;
285 try {
286 the_doc = this.converter.getDOM(doc_file, this.document_encoding);
287 } catch (Exception e) {
288 logger.error("couldn't create a DOM from file "+doc_file.getPath());
289 return null;
290 }
291
292 return the_doc.getDocumentElement();
293
294 }
295
296
297 protected Element getSection(Element doc_elem, String node_id) {
298 String [] bits = node_id.split("\\.");
299 if (bits.length > 4) {
300 logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
301 return null;
302 }
303
304 String id="";
305 String tagname = "";
306 String scope = "";
307 if (bits.length==2) {
308 tagname = bits[1];
309 } else {
310 scope = bits[1];
311 tagname = bits[2];
312
313 if (bits.length == 4) {
314 id = bits[3];
315 }
316 }
317 scope = translateScope(scope);
318 Element top=null;
319 if (!scope.equals("")) {
320 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
321 if (top == null) {
322 // something gone wrong
323 return null;
324 }
325 } else {
326 top = doc_elem;
327 }
328
329 NodeList elements = top.getElementsByTagName(tagname);
330 if (elements.getLength() == 0) {
331 return null;
332 }
333 // no id, just return the first one
334 if (id.equals("")) {
335 return (Element)elements.item(0);
336 }
337 // have an id, need to check and find the right one.
338 for (int i=0; i<elements.getLength();i++) {
339 Element e = (Element)elements.item(i);
340 if (e.getAttribute("gs3:id").equals(id)) {
341 return e;
342 }
343 }
344 return null;
345
346 }
347
348 protected Element getMetadata(String node_id, boolean all, Vector<String> meta_name_list) {
349
350 // our default strategy here is to only return Title and root:Title
351 // ignore all others
352 // the title of a section is just a little bit of the text inside it.
353 // the root_Title is the title from the doc info in the config file
354 Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
355 String doc_name = getWorkName(node_id);
356 boolean node_is_root = false;
357 if (doc_name.equals(node_id)) {
358 node_is_root = true;
359 }
360
361 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
362 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
363
364 boolean get_section_title = false;
365
366 if (all) {
367 if (node_is_root) {
368 return (Element)this.doc.importNode(doc_meta_list, true);
369 } else {
370 get_section_title = true;
371 }
372
373 } else {
374 // have to process metadata one by one
375 for (int i=0; i<meta_name_list.size(); i++) {
376 String meta_name = meta_name_list.elementAt(i);
377 String actual_meta_name = meta_name;
378 if (meta_name.startsWith("root_")) {
379 actual_meta_name = meta_name.substring(5);
380 } else {
381 // its a section level one - check to see if doc is root
382 if (!node_is_root) {
383 if (meta_name.equals("Title")) {
384 get_section_title = true;
385 }
386 continue; // move on to teh next metadata
387 }
388 }
389
390 // here, we look for the specific meta elem in doc_meta_list
391 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
392 if (meta_item != null) {
393 meta_item = (Element)this.doc.importNode(meta_item, true);
394 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
395 metadata_list.appendChild(meta_item);
396 }
397 } // for each metadata
398 }
399
400 // now we have processed all teh doc metadata, just have section one to go, if needed
401 if (get_section_title) {
402
403 Element doc_elem = loadDocument(doc_name);
404 if (doc_elem != null) {
405 Element section = getSection(doc_elem, node_id);
406 if (section != null) {
407 Element title_meta = extractTitleMeta(section);
408 if (title_meta != null) {
409 metadata_list.appendChild(title_meta);
410 }
411 }
412 }
413
414 }
415 return metadata_list;
416 }
417
418 protected Element extractTitleMeta(Element section) {
419 Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
420 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
421
422 String title = "dummy title";
423 Text t = this.doc.createTextNode(title);
424 meta_elem.appendChild(t);
425 return meta_elem;
426
427 }
428 // some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
429
430 // the full default nodeID looks like work.scope.tag.id
431 // the shorter versions are work, work.tag, work.scope.tag
432 protected String getWorkName(String node_id) {
433 int pos = node_id.indexOf('.');
434 if (pos == -1) {
435 return node_id;
436 }
437 return node_id.substring(0, pos);
438 }
439
440 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
441 protected String translateScope(String scope) {
442 if (this.document_root_tag != null) {
443 return GSPath.appendLink(this.document_root_tag, scope);
444 }
445 return scope;
446 }
447
448}
449
Note: See TracBrowser for help on using the repository browser.