source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 32490

Last change on this file since 32490 was 32490, checked in by kjdon, 6 years ago

we need to supply entity resolver to the transform call, otherwise it can't find the DTD (gberg collection)

  • Property svn:keywords set to Author Date Id Revision
File size: 16.0 KB
Line 
1/*
2 * XMLRetrieve.java
3 * Copyright (C) 2014 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19
20package org.greenstone.gsdl3.service;
21
22
23// Greenstone classes
24import org.greenstone.gsdl3.util.*;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Attr;
31import org.w3c.dom.Text;
32import org.w3c.dom.NodeList;
33import org.w3c.dom.NamedNodeMap;
34
35// General Java classes
36import java.io.File;
37import java.io.Serializable;
38import java.util.Vector;
39import java.util.HashMap;
40
41import org.apache.log4j.*;
42
43public class XMLRetrieve extends ServiceRack {
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
46 protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
47 protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
48 protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
49
50 protected String toc_xsl_name = "";
51 protected String document_encoding = "";
52 protected String document_root_tag = "";
53
54 protected Element collection_doc_list = null;
55
56 protected boolean provide_content = true;
57 protected boolean provide_structure = true;
58 protected boolean provide_metadata = true;
59
60 protected GSEntityResolver entity_resolver = null;
61
62 public boolean configure(Element info, Element extra_info) {
63 if (!super.configure(info, extra_info)){
64 return false;
65 }
66 logger.info("configuring XMLRetrieve...");
67 // look for the parameters
68 Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
69 HashMap<String, Serializable> params;
70 String services_to_provide = "";
71 if (param_list != null) {
72 params = GSXML.extractParams(param_list, false);
73 this.toc_xsl_name = (String)params.get("tocXSLT");
74 this.document_encoding = (String)params.get("documentEncoding");
75 this.document_root_tag = (String)params.get("documentRootTag");
76 services_to_provide = (String)params.get("provideServices");
77 }
78 if (this.toc_xsl_name == null || this.toc_xsl_name.equals("")) {
79 this.toc_xsl_name = "default_toc";
80 }
81 this.toc_xsl_name = this.toc_xsl_name+".xsl";
82
83 if (this.document_encoding == null || this.document_encoding.equals("")) {
84 this.document_encoding = "UTF-8";
85 }
86
87 if (services_to_provide != null && !services_to_provide.equals("")) {
88 if (services_to_provide.indexOf("content")==-1) {
89 provide_content = false;
90 }
91 if (services_to_provide.indexOf("metadata")==-1) {
92 provide_metadata = false;
93 }
94 if (services_to_provide.indexOf("structure")==-1) {
95 provide_structure = false;
96 }
97
98 }
99
100 // set up short_service_info_ - for now just has name and type
101 Element retrieve_service;
102 if (provide_content) {
103 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
104 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
105 retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
106 this.short_service_info.appendChild(retrieve_service);
107 }
108 if (provide_metadata) {
109 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
110 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
111 retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
112 this.short_service_info.appendChild(retrieve_service);
113 }
114 if (provide_structure) {
115 retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
116 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
117 retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
118 this.short_service_info.appendChild(retrieve_service);
119 }
120 // find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
121 collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
122
123 entity_resolver = new GSEntityResolver();
124 entity_resolver.setClassLoader(this.class_loader);
125 //this.converter.setEntityResolver(resolver);
126 return true;
127 }
128
129 // this may get called but is not useful in the case of retrieve services
130 protected Element getServiceDescription(Document doc, String service_id, String lang, String subset) {
131
132 Element retrieve_service = doc.createElement(GSXML.SERVICE_ELEM);
133 retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
134 retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
135 return retrieve_service;
136 }
137
138 protected Element processDocumentContentRetrieve(Element request) {
139 Document result_doc = XMLConverter.newDOM();
140 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
141 result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
142 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
143
144 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
145 if (doc_list == null) {
146 return result;
147 }
148 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
149 result.appendChild(result_doc_list);
150
151 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
152 for (int i=0; i<docs.getLength(); i++) {
153
154 Element doc = (Element)docs.item(i);
155 Element content = result_doc.createElement(GSXML.NODE_CONTENT_ELEM);
156 doc.appendChild(content);
157
158 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
159 String doc_name = getWorkName(node_id);
160
161 Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
162 if (doc_elem == null) {
163 continue;
164 }
165
166
167 // if we have asked for the whole doc, just append it
168 if (doc_name.equals(node_id)) {
169 content.appendChild(result_doc.importNode(doc_elem, true));
170 continue;
171 }
172
173 // else we only want a sub section
174
175 Element section = getSection(doc_elem, node_id);
176 if (section != null) {
177 content.appendChild(result_doc.importNode(section, true));
178 }
179
180 } // for each doc
181
182 return result;
183
184 }
185
186 protected Element processDocumentStructureRetrieve(Element request) {
187 Document result_doc = XMLConverter.newDOM();
188 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
189 result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
190 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
191
192 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
193 if (doc_list == null) {
194 logger.error("no documents specified in the request. ");
195 return result;
196 }
197
198 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
199 result.appendChild(result_doc_list);
200 // first look for the stylesheet in the collection
201 File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
202 if (!stylesheet.exists()) {
203 // now try in the site
204 stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
205 }
206 if (!stylesheet.exists()) {
207 logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
208 return result;
209 }
210
211 // for now, we dont have any params, and we always return the structure of the whole document
212
213 XMLTransformer transformer = new XMLTransformer();
214 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
215
216 for (int i=0; i<docs.getLength(); i++) {
217
218 Element doc = (Element)docs.item(i);
219
220 Element structure = result_doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
221 doc.appendChild(structure);
222 String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
223 // make sure we are at the top level
224 doc_name = getWorkName(doc_name);
225
226 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
227
228 if (!doc_file.exists()) {
229 logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
230 } else {
231 try {
232 Node toc = transformer.transform(stylesheet, doc_file, null, this.entity_resolver);
233 structure.appendChild(result_doc.importNode(toc, true));
234 } catch (Exception e) {
235 logger.error("couldn't transform the document to get the toc");
236 }
237 }
238
239 }
240
241 return result;
242
243 }
244
245 // this just extracts a bit of text from the section to use as the Title
246 // this should be overwritten for any format that has something more suitable
247 protected Element processDocumentMetadataRetrieve(Element request) {
248 Document result_doc = XMLConverter.newDOM();
249 Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
250 result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
251 result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
252
253 Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
254 if (doc_list == null) {
255 logger.error("no documents in the request");
256 return result;
257 }
258
259 Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
260 result.appendChild(result_doc_list);
261
262 Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
263 if (param_list == null) {
264 logger.error("no metadata in the request");
265 return result;
266 }
267
268 Vector<String> meta_name_list = new Vector<String>();
269 boolean all_metadata = false;
270 // Process the request parameters
271 Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
272 while (param != null) {
273 // Identify the metadata information desired
274 if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
275 String metadata = GSXML.getValue(param);
276 if (metadata.equals("all")) {
277 all_metadata = true;
278 break;
279 }
280 meta_name_list.add(metadata);
281 }
282 param = (Element) param.getNextSibling();
283 }
284
285 NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
286 for (int i=0; i<docs.getLength(); i++) {
287 Element doc = (Element)docs.item(i);
288 String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
289 String doc_name = getWorkName(node_id);
290
291 Element metadata_list = getMetadata(result_doc, node_id, all_metadata, meta_name_list);
292 doc.appendChild(metadata_list);
293 }
294
295 return result;
296 }
297
298 protected Element loadDocument(String doc_name) {
299 // try to find the document
300 File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
301
302 if (!doc_file.exists()) {
303 logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
304 return null;
305 }
306
307 Document the_doc = null;
308 try {
309 the_doc = this.converter.getDOM(doc_file, this.document_encoding, this.entity_resolver);
310 } catch (Exception e) {
311 logger.error("couldn't create a DOM from file "+doc_file.getPath());
312 return null;
313 }
314
315 return the_doc.getDocumentElement();
316
317 }
318
319
320 protected Element getSection(Element doc_elem, String node_id) {
321 String [] bits = node_id.split("\\.");
322 if (bits.length > 4) {
323 logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
324 return null;
325 }
326
327 String id="";
328 String tagname = "";
329 String scope = "";
330 if (bits.length==2) {
331 tagname = bits[1];
332 } else {
333 scope = bits[1];
334 tagname = bits[2];
335
336 if (bits.length == 4) {
337 id = bits[3];
338 }
339 }
340 scope = translateScope(scope);
341 Element top=null;
342 if (!scope.equals("")) {
343 top = (Element)GSXML.getNodeByPath(doc_elem, scope);
344 if (top == null) {
345 // something gone wrong
346 return null;
347 }
348 } else {
349 top = doc_elem;
350 }
351
352 NodeList elements = top.getElementsByTagName(tagname);
353 if (elements.getLength() == 0) {
354 return null;
355 }
356 // no id, just return the first one
357 if (id.equals("")) {
358 return (Element)elements.item(0);
359 }
360 // have an id, need to check and find the right one.
361 for (int i=0; i<elements.getLength();i++) {
362 Element e = (Element)elements.item(i);
363 if (e.getAttribute("gs3:id").equals(id)) {
364 return e;
365 }
366 }
367 return null;
368
369 }
370
371 protected Element getMetadata(Document result_doc, String node_id, boolean all, Vector<String> meta_name_list) {
372
373 // our default strategy here is to only return Title and root:Title
374 // ignore all others
375 // the title of a section is just a little bit of the text inside it.
376 // the root_Title is the title from the doc info in the config file
377 Element metadata_list = result_doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
378 String doc_name = getWorkName(node_id);
379 boolean node_is_root = false;
380 if (doc_name.equals(node_id)) {
381 node_is_root = true;
382 }
383
384 Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
385 Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
386
387 boolean get_section_title = false;
388
389 if (all) {
390 if (node_is_root) {
391 return (Element)result_doc.importNode(doc_meta_list, true);
392 } else {
393 get_section_title = true;
394 }
395
396 } else {
397 // have to process metadata one by one
398 for (int i=0; i<meta_name_list.size(); i++) {
399 String meta_name = meta_name_list.elementAt(i);
400 String actual_meta_name = meta_name;
401 if (meta_name.startsWith("root_")) {
402 actual_meta_name = meta_name.substring(5);
403 } else {
404 // its a section level one - check to see if doc is root
405 if (!node_is_root) {
406 if (meta_name.equals("Title")) {
407 get_section_title = true;
408 }
409 continue; // move on to teh next metadata
410 }
411 }
412
413 // here, we look for the specific meta elem in doc_meta_list
414 Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
415 if (meta_item != null) {
416 meta_item = (Element)result_doc.importNode(meta_item, true);
417 meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
418 metadata_list.appendChild(meta_item);
419 }
420 } // for each metadata
421 }
422
423 // now we have processed all teh doc metadata, just have section one to go, if needed
424 if (get_section_title) {
425
426 Element doc_elem = loadDocument(doc_name);
427 if (doc_elem != null) {
428 Element section = getSection(doc_elem, node_id);
429 if (section != null) {
430 Element title_meta = extractTitleMeta(result_doc, section);
431 if (title_meta != null) {
432 metadata_list.appendChild(title_meta);
433 }
434 }
435 }
436
437 }
438 return metadata_list;
439 }
440
441 protected Element extractTitleMeta(Document result_doc, Element section) {
442 Element meta_elem = result_doc.createElement(GSXML.METADATA_ELEM);
443 meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
444
445 String title = "dummy title";
446 Text t = result_doc.createTextNode(title);
447 meta_elem.appendChild(t);
448 return meta_elem;
449
450 }
451 // some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
452
453 // the full default nodeID looks like work.scope.tag.id
454 // the shorter versions are work, work.tag, work.scope.tag
455 protected String getWorkName(String node_id) {
456 int pos = node_id.indexOf('.');
457 if (pos == -1) {
458 return node_id;
459 }
460 return node_id.substring(0, pos);
461 }
462
463 // this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
464 protected String translateScope(String scope) {
465 if (this.document_root_tag != null) {
466 return GSPath.appendLink(this.document_root_tag, scope);
467 }
468 return scope;
469 }
470
471}
472
Note: See TracBrowser for help on using the repository browser.