source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 32453

Last change on this file since 32453 was 28966, checked in by kjdon, 10 years ago

Lots of changes, mainly to do with removing this.doc from everywhere, because Document is not thread safe. Now we tend to create a new Document every time we start a new page/message etc. In a service, this.desc_doc is available as the document for creating service info; it should only be used for that and not for other messages. newDOM is now static in XMLConverter. Method parameters have changed for some GSXML methods.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.0 KB
Line 
1/*
2 * XMLRetrieve.java
3 * Copyright (C) 2014 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20package org.greenstone.gsdl3.service;
21
22
23// Greenstone classes
24import org.greenstone.gsdl3.util.*;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Attr;
31import org.w3c.dom.Text;
32import org.w3c.dom.NodeList;
33import org.w3c.dom.NamedNodeMap;
34
35// General Java classes
36import java.io.File;
37import java.io.Serializable;
38import java.util.Vector;
39import java.util.HashMap;
40
41import org.apache.log4j.*;
42
43public class XMLRetrieve extends ServiceRack {
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
46 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
47 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
48 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
49
50 protected String toc_xsl_name = "";
51 protected String document_encoding = "";
52 protected String document_root_tag = "";
53
54 protected Element collection_doc_list = null;
55
56 protected boolean provide_content = true;
57 protected boolean provide_structure = true;
58 protected boolean provide_metadata = true;
59
60 protected GSEntityResolver entity_resolver = null;
61
62 public boolean configure(Element info, Element extra_info) {
63 if (!super.configure(info, extra_info)){
64 return false;
65 }
66 logger.info("configuring XMLRetrieve...");
67 // look for the parameters
68 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
69 HashMap<String, Serializable> params;
70 String services_to_provide = "";
71 if (param_list != null) {
72 params = GSXML.extractParams(param_list, false);
73 this.toc_xsl_name = (String)params.get("tocXSLT");
74 this.document_encoding = (String)params.get("documentEncoding");
75 this.document_root_tag = (String)params.get("documentRootTag");
76 services_to_provide = (String)params.get("provideServices");
77 }
78 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
79 this.toc_xsl_name = "default_toc";
80 }
81 this.toc_xsl_name = this.toc_xsl_name+".xsl";
82
83 if (this.document_encoding == null || this.document_encoding.equals("")) {
84 this.document_encoding = "UTF-8";
85 }
86
87 if (services_to_provide != null && !services_to_provide.equals("")) {
88 if (services_to_provide.indexOf("content")==-1) {
89 provide_content = false;
90 }
91 if (services_to_provide.indexOf("metadata")==-1) {
92 provide_metadata = false;
93 }
94 if (services_to_provide.indexOf("structure")==-1) {
95 provide_structure = false;
96 }
97
98 }
99
100 // set up short_service_info_ - for now just has name and type
101 Element retrieve_service;
102 if (provide_content) {
103 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
104 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
105 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
106 this.short_service_info.appendChild(retrieve_service);
107 }
108 if (provide_metadata) {
109 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
110 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
111 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
112 this.short_service_info.appendChild(retrieve_service);
113 }
114 if (provide_structure) {
115 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
116 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
117 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
118 this.short_service_info.appendChild(retrieve_service);
119 }
120 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
121 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
122
123 entity_resolver = new GSEntityResolver();
124 entity_resolver.setClassLoader(this.class_loader);
125 //this.converter.setEntityResolver(resolver);
126 return true;
127 }
128
129 // this may get called but is not useful in the case of retrieve services
130 protected Element getServiceDescription(Document doc, String service_id, String lang, String subset) {
131
132 Element retrieve_service = doc.createElement(GSXML.SERVICE_ELEM);
133 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
134 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
135 return retrieve_service;
136 }
137
138 protected Element processDocumentContentRetrieve(Element request) {
139 Document result_doc = XMLConverter.newDOM();
140 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
141 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
142 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
143
144 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
145 if (doc_list == null) {
146 return result;
147 }
148 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
149 result.appendChild(result_doc_list);
150
151 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
152 for (int i=0; i<docs.getLength(); i++) {
153
154 Element doc = (Element)docs.item(i);
155 Element content = result_doc.createElement(GSXML.NODE_CONTENT_ELEM);
156 doc.appendChild(content);
157
158 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
159 String doc_name = getWorkName(node_id);
160
161 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
162 if (doc_elem == null) {
163 continue;
164 }
165
166
167 // if we have asked for the whole doc, just append it
168 if (doc_name.equals(node_id)) {
169 content.appendChild(result_doc.importNode(doc_elem, true));
170 continue;
171 }
172
173 // else we only want a sub section
174
175 Element section = getSection(doc_elem, node_id);
176 if (section != null) {
177 content.appendChild(result_doc.importNode(section, true));
178 }
179
180 } // for each doc
181
182 return result;
183
184 }
185
186 protected Element processDocumentStructureRetrieve(Element request) {
187 Document result_doc = XMLConverter.newDOM();
188 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
189 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
190 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
191
192 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
193 if (doc_list == null) {
194 logger.error("no documents specified in the request. ");
195 return result;
196 }
197
198 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
199 result.appendChild(result_doc_list);
200 // first look for the stylesheet in the collection
201 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
202 if (!stylesheet.exists()) {
203 // now try in the site
204 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
205 }
206 if (!stylesheet.exists()) {
207 logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
208 return result;
209 }
210
211 // for now, we dont have any params, and we always return the structure of the whole document
212
213 XMLTransformer transformer = new XMLTransformer();
214 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
215
216 for (int i=0; i<docs.getLength(); i++) {
217
218 Element doc = (Element)docs.item(i);
219
220 Element structure = result_doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
221 doc.appendChild(structure);
222 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
223 // make sure we are at the top level
224 doc_name = getWorkName(doc_name);
225
226 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
227
228 if (!doc_file.exists()) {
229 logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
230 } else {
231 try {
232 Node toc = transformer.transform(stylesheet, doc_file);
233 structure.appendChild(result_doc.importNode(toc, true));
234 } catch (Exception e) {
235 logger.error("couldn't transform the document to get the toc");
236 }
237 }
238
239 }
240
241 return result;
242
243 }
244
245 // this just extracts a bit of text from the section to use as the Title
246 // this should be overwritten for any format that has something more suitable
247 protected Element processDocumentMetadataRetrieve(Element request) {
248 Document result_doc = XMLConverter.newDOM();
249 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
250 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
251 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
252
253 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
254 if (doc_list == null) {
255 logger.error("no documents in the request");
256 return result;
257 }
258
259 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
260 result.appendChild(result_doc_list);
261
262 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
263 if (param_list == null) {
264 logger.error("no metadata in the request");
265 return result;
266 }
267
268 Vector<String> meta_name_list = new Vector<String>();
269 boolean all_metadata = false;
270 // Process the request parameters
271 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
272 while (param != null) {
273 // Identify the metadata information desired
274 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
275 String metadata = GSXML.getValue(param);
276 if (metadata.equals("all")) {
277 all_metadata = true;
278 break;
279 }
280 meta_name_list.add(metadata);
281 }
282 param = (Element) param.getNextSibling();
283 }
284
285 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
286 for (int i=0; i<docs.getLength(); i++) {
287 Element doc = (Element)docs.item(i);
288 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
289 String doc_name = getWorkName(node_id);
290
291 Element metadata_list = getMetadata(result_doc, node_id, all_metadata, meta_name_list);
292 doc.appendChild(metadata_list);
293 }
294
295 return result;
296 }
297
298 protected Element loadDocument(String doc_name) {
299 // try to find the document
300 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
301
302 if (!doc_file.exists()) {
303 logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
304 return null;
305 }
306
307 Document the_doc = null;
308 try {
309 the_doc = this.converter.getDOM(doc_file, this.document_encoding, this.entity_resolver);
310 } catch (Exception e) {
311 logger.error("couldn't create a DOM from file "+doc_file.getPath());
312 return null;
313 }
314
315 return the_doc.getDocumentElement();
316
317 }
318
319
320 protected Element getSection(Element doc_elem, String node_id) {
321 String [] bits = node_id.split("\\.");
322 if (bits.length > 4) {
323 logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
324 return null;
325 }
326
327 String id="";
328 String tagname = "";
329 String scope = "";
330 if (bits.length==2) {
331 tagname = bits[1];
332 } else {
333 scope = bits[1];
334 tagname = bits[2];
335
336 if (bits.length == 4) {
337 id = bits[3];
338 }
339 }
340 scope = translateScope(scope);
341 Element top=null;
342 if (!scope.equals("")) {
343 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
344 if (top == null) {
345 // something gone wrong
346 return null;
347 }
348 } else {
349 top = doc_elem;
350 }
351
352 NodeList elements = top.getElementsByTagName(tagname);
353 if (elements.getLength() == 0) {
354 return null;
355 }
356 // no id, just return the first one
357 if (id.equals("")) {
358 return (Element)elements.item(0);
359 }
360 // have an id, need to check and find the right one.
361 for (int i=0; i<elements.getLength();i++) {
362 Element e = (Element)elements.item(i);
363 if (e.getAttribute("gs3:id").equals(id)) {
364 return e;
365 }
366 }
367 return null;
368
369 }
370
371 protected Element getMetadata(Document result_doc, String node_id, boolean all, Vector<String> meta_name_list) {
372
373 // our default strategy here is to only return Title and root:Title
374 // ignore all others
375 // the title of a section is just a little bit of the text inside it.
376 // the root_Title is the title from the doc info in the config file
377 Element metadata_list = result_doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
378 String doc_name = getWorkName(node_id);
379 boolean node_is_root = false;
380 if (doc_name.equals(node_id)) {
381 node_is_root = true;
382 }
383
384 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
385 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
386
387 boolean get_section_title = false;
388
389 if (all) {
390 if (node_is_root) {
391 return (Element)result_doc.importNode(doc_meta_list, true);
392 } else {
393 get_section_title = true;
394 }
395
396 } else {
397 // have to process metadata one by one
398 for (int i=0; i<meta_name_list.size(); i++) {
399 String meta_name = meta_name_list.elementAt(i);
400 String actual_meta_name = meta_name;
401 if (meta_name.startsWith("root_")) {
402 actual_meta_name = meta_name.substring(5);
403 } else {
404 // its a section level one - check to see if doc is root
405 if (!node_is_root) {
406 if (meta_name.equals("Title")) {
407 get_section_title = true;
408 }
409 continue; // move on to teh next metadata
410 }
411 }
412
413 // here, we look for the specific meta elem in doc_meta_list
414 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
415 if (meta_item != null) {
416 meta_item = (Element)result_doc.importNode(meta_item, true);
417 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
418 metadata_list.appendChild(meta_item);
419 }
420 } // for each metadata
421 }
422
423 // now we have processed all teh doc metadata, just have section one to go, if needed
424 if (get_section_title) {
425
426 Element doc_elem = loadDocument(doc_name);
427 if (doc_elem != null) {
428 Element section = getSection(doc_elem, node_id);
429 if (section != null) {
430 Element title_meta = extractTitleMeta(result_doc, section);
431 if (title_meta != null) {
432 metadata_list.appendChild(title_meta);
433 }
434 }
435 }
436
437 }
438 return metadata_list;
439 }
440
441 protected Element extractTitleMeta(Document result_doc, Element section) {
442 Element meta_elem = result_doc.createElement(GSXML.METADATA_ELEM);
443 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
444
445 String title = "dummy title";
446 Text t = result_doc.createTextNode(title);
447 meta_elem.appendChild(t);
448 return meta_elem;
449
450 }
451 // some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
452
453 // the full default nodeID looks like work.scope.tag.id
454 // the shorter versions are work, work.tag, work.scope.tag
455 protected String getWorkName(String node_id) {
456 int pos = node_id.indexOf('.');
457 if (pos == -1) {
458 return node_id;
459 }
460 return node_id.substring(0, pos);
461 }
462
463 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
464 protected String translateScope(String scope) {
465 if (this.document_root_tag != null) {
466 return GSPath.appendLink(this.document_root_tag, scope);
467 }
468 return scope;
469 }
470
471}
472
Note: See TracBrowser for help on using the repository browser.