Context Navigation

XMLRetrieve.java@ 32490

Last change on this file since 32490 was 32490, checked in by kjdon, 6 years ago
we need to supply entity resolver to the transform call, otherwise it can't find the DTD (gberg collection)
Property svn:keywords set to `Author Date Id Revision`
File size: 16.0 KB

Line
1	/*
2	* XMLRetrieve.java
3	* Copyright (C) 2014 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19
20	package org.greenstone.gsdl3.service;
21
22
23	// Greenstone classes
24	import org.greenstone.gsdl3.util.*;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Attr;
31	import org.w3c.dom.Text;
32	import org.w3c.dom.NodeList;
33	import org.w3c.dom.NamedNodeMap;
34
35	// General Java classes
36	import java.io.File;
37	import java.io.Serializable;
38	import java.util.Vector;
39	import java.util.HashMap;
40
41	import org.apache.log4j.*;
42
43	public class XMLRetrieve extends ServiceRack {
44
45	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
46	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
47	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
48	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
49
50	protected String toc_xsl_name = "";
51	protected String document_encoding = "";
52	protected String document_root_tag = "";
53
54	protected Element collection_doc_list = null;
55
56	protected boolean provide_content = true;
57	protected boolean provide_structure = true;
58	protected boolean provide_metadata = true;
59
60	protected GSEntityResolver entity_resolver = null;
61
62	public boolean configure(Element info, Element extra_info) {
63	if (!super.configure(info, extra_info)){
64	return false;
65	}
66	logger.info("configuring XMLRetrieve...");
67	// look for the parameters
68	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
69	HashMap<String, Serializable> params;
70	String services_to_provide = "";
71	if (param_list != null) {
72	params = GSXML.extractParams(param_list, false);
73	this.toc_xsl_name = (String)params.get("tocXSLT");
74	this.document_encoding = (String)params.get("documentEncoding");
75	this.document_root_tag = (String)params.get("documentRootTag");
76	services_to_provide = (String)params.get("provideServices");
77	}
78	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
79	this.toc_xsl_name = "default_toc";
80	}
81	this.toc_xsl_name = this.toc_xsl_name+".xsl";
82
83	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
84	this.document_encoding = "UTF-8";
85	}
86
87	if (services_to_provide != null && !services_to_provide.equals("")) {
88	if (services_to_provide.indexOf("content")==-1) {
89	provide_content = false;
90	}
91	if (services_to_provide.indexOf("metadata")==-1) {
92	provide_metadata = false;
93	}
94	if (services_to_provide.indexOf("structure")==-1) {
95	provide_structure = false;
96	}
97
98	}
99
100	// set up short_service_info_ - for now just has name and type
101	Element retrieve_service;
102	if (provide_content) {
103	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
104	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
105	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
106	this.short_service_info.appendChild(retrieve_service);
107	}
108	if (provide_metadata) {
109	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
110	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
111	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
112	this.short_service_info.appendChild(retrieve_service);
113	}
114	if (provide_structure) {
115	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
116	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
117	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
118	this.short_service_info.appendChild(retrieve_service);
119	}
120	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
121	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
122
123	entity_resolver = new GSEntityResolver();
124	entity_resolver.setClassLoader(this.class_loader);
125	//this.converter.setEntityResolver(resolver);
126	return true;
127	}
128
129	// this may get called but is not useful in the case of retrieve services
130	protected Element getServiceDescription(Document doc, String service_id, String lang, String subset) {
131
132	Element retrieve_service = doc.createElement(GSXML.SERVICE_ELEM);
133	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
134	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
135	return retrieve_service;
136	}
137
138	protected Element processDocumentContentRetrieve(Element request) {
139	Document result_doc = XMLConverter.newDOM();
140	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
141	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
142	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
143
144	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
145	if (doc_list == null) {
146	return result;
147	}
148	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
149	result.appendChild(result_doc_list);
150
151	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
152	for (int i=0; i<docs.getLength(); i++) {
153
154	Element doc = (Element)docs.item(i);
155	Element content = result_doc.createElement(GSXML.NODE_CONTENT_ELEM);
156	doc.appendChild(content);
157
158	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
159	String doc_name = getWorkName(node_id);
160
161	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
162	if (doc_elem == null) {
163	continue;
164	}
165
166
167	// if we have asked for the whole doc, just append it
168	if (doc_name.equals(node_id)) {
169	content.appendChild(result_doc.importNode(doc_elem, true));
170	continue;
171	}
172
173	// else we only want a sub section
174
175	Element section = getSection(doc_elem, node_id);
176	if (section != null) {
177	content.appendChild(result_doc.importNode(section, true));
178	}
179
180	} // for each doc
181
182	return result;
183
184	}
185
186	protected Element processDocumentStructureRetrieve(Element request) {
187	Document result_doc = XMLConverter.newDOM();
188	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
189	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
190	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
191
192	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
193	if (doc_list == null) {
194	logger.error("no documents specified in the request. ");
195	return result;
196	}
197
198	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
199	result.appendChild(result_doc_list);
200	// first look for the stylesheet in the collection
201	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
202	if (!stylesheet.exists()) {
203	// now try in the site
204	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
205	}
206	if (!stylesheet.exists()) {
207	logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
208	return result;
209	}
210
211	// for now, we dont have any params, and we always return the structure of the whole document
212
213	XMLTransformer transformer = new XMLTransformer();
214	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
215
216	for (int i=0; i<docs.getLength(); i++) {
217
218	Element doc = (Element)docs.item(i);
219
220	Element structure = result_doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
221	doc.appendChild(structure);
222	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
223	// make sure we are at the top level
224	doc_name = getWorkName(doc_name);
225
226	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
227
228	if (!doc_file.exists()) {
229	logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
230	} else {
231	try {
232	Node toc = transformer.transform(stylesheet, doc_file, null, this.entity_resolver);
233	structure.appendChild(result_doc.importNode(toc, true));
234	} catch (Exception e) {
235	logger.error("couldn't transform the document to get the toc");
236	}
237	}
238
239	}
240
241	return result;
242
243	}
244
245	// this just extracts a bit of text from the section to use as the Title
246	// this should be overwritten for any format that has something more suitable
247	protected Element processDocumentMetadataRetrieve(Element request) {
248	Document result_doc = XMLConverter.newDOM();
249	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
250	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
251	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
252
253	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
254	if (doc_list == null) {
255	logger.error("no documents in the request");
256	return result;
257	}
258
259	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
260	result.appendChild(result_doc_list);
261
262	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
263	if (param_list == null) {
264	logger.error("no metadata in the request");
265	return result;
266	}
267
268	Vector<String> meta_name_list = new Vector<String>();
269	boolean all_metadata = false;
270	// Process the request parameters
271	Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
272	while (param != null) {
273	// Identify the metadata information desired
274	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
275	String metadata = GSXML.getValue(param);
276	if (metadata.equals("all")) {
277	all_metadata = true;
278	break;
279	}
280	meta_name_list.add(metadata);
281	}
282	param = (Element) param.getNextSibling();
283	}
284
285	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
286	for (int i=0; i<docs.getLength(); i++) {
287	Element doc = (Element)docs.item(i);
288	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
289	String doc_name = getWorkName(node_id);
290
291	Element metadata_list = getMetadata(result_doc, node_id, all_metadata, meta_name_list);
292	doc.appendChild(metadata_list);
293	}
294
295	return result;
296	}
297
298	protected Element loadDocument(String doc_name) {
299	// try to find the document
300	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
301
302	if (!doc_file.exists()) {
303	logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
304	return null;
305	}
306
307	Document the_doc = null;
308	try {
309	the_doc = this.converter.getDOM(doc_file, this.document_encoding, this.entity_resolver);
310	} catch (Exception e) {
311	logger.error("couldn't create a DOM from file "+doc_file.getPath());
312	return null;
313	}
314
315	return the_doc.getDocumentElement();
316
317	}
318
319
320	protected Element getSection(Element doc_elem, String node_id) {
321	String [] bits = node_id.split("\\.");
322	if (bits.length > 4) {
323	logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
324	return null;
325	}
326
327	String id="";
328	String tagname = "";
329	String scope = "";
330	if (bits.length==2) {
331	tagname = bits[1];
332	} else {
333	scope = bits[1];
334	tagname = bits[2];
335
336	if (bits.length == 4) {
337	id = bits[3];
338	}
339	}
340	scope = translateScope(scope);
341	Element top=null;
342	if (!scope.equals("")) {
343	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
344	if (top == null) {
345	// something gone wrong
346	return null;
347	}
348	} else {
349	top = doc_elem;
350	}
351
352	NodeList elements = top.getElementsByTagName(tagname);
353	if (elements.getLength() == 0) {
354	return null;
355	}
356	// no id, just return the first one
357	if (id.equals("")) {
358	return (Element)elements.item(0);
359	}
360	// have an id, need to check and find the right one.
361	for (int i=0; i<elements.getLength();i++) {
362	Element e = (Element)elements.item(i);
363	if (e.getAttribute("gs3:id").equals(id)) {
364	return e;
365	}
366	}
367	return null;
368
369	}
370
371	protected Element getMetadata(Document result_doc, String node_id, boolean all, Vector<String> meta_name_list) {
372
373	// our default strategy here is to only return Title and root:Title
374	// ignore all others
375	// the title of a section is just a little bit of the text inside it.
376	// the root_Title is the title from the doc info in the config file
377	Element metadata_list = result_doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
378	String doc_name = getWorkName(node_id);
379	boolean node_is_root = false;
380	if (doc_name.equals(node_id)) {
381	node_is_root = true;
382	}
383
384	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
385	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
386
387	boolean get_section_title = false;
388
389	if (all) {
390	if (node_is_root) {
391	return (Element)result_doc.importNode(doc_meta_list, true);
392	} else {
393	get_section_title = true;
394	}
395
396	} else {
397	// have to process metadata one by one
398	for (int i=0; i<meta_name_list.size(); i++) {
399	String meta_name = meta_name_list.elementAt(i);
400	String actual_meta_name = meta_name;
401	if (meta_name.startsWith("root_")) {
402	actual_meta_name = meta_name.substring(5);
403	} else {
404	// its a section level one - check to see if doc is root
405	if (!node_is_root) {
406	if (meta_name.equals("Title")) {
407	get_section_title = true;
408	}
409	continue; // move on to teh next metadata
410	}
411	}
412
413	// here, we look for the specific meta elem in doc_meta_list
414	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
415	if (meta_item != null) {
416	meta_item = (Element)result_doc.importNode(meta_item, true);
417	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
418	metadata_list.appendChild(meta_item);
419	}
420	} // for each metadata
421	}
422
423	// now we have processed all teh doc metadata, just have section one to go, if needed
424	if (get_section_title) {
425
426	Element doc_elem = loadDocument(doc_name);
427	if (doc_elem != null) {
428	Element section = getSection(doc_elem, node_id);
429	if (section != null) {
430	Element title_meta = extractTitleMeta(result_doc, section);
431	if (title_meta != null) {
432	metadata_list.appendChild(title_meta);
433	}
434	}
435	}
436
437	}
438	return metadata_list;
439	}
440
441	protected Element extractTitleMeta(Document result_doc, Element section) {
442	Element meta_elem = result_doc.createElement(GSXML.METADATA_ELEM);
443	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
444
445	String title = "dummy title";
446	Text t = result_doc.createTextNode(title);
447	meta_elem.appendChild(t);
448	return meta_elem;
449
450	}
451	// some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
452
453	// the full default nodeID looks like work.scope.tag.id
454	// the shorter versions are work, work.tag, work.scope.tag
455	protected String getWorkName(String node_id) {
456	int pos = node_id.indexOf('.');
457	if (pos == -1) {
458	return node_id;
459	}
460	return node_id.substring(0, pos);
461	}
462
463	// this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
464	protected String translateScope(String scope) {
465	if (this.document_root_tag != null) {
466	return GSPath.appendLink(this.document_root_tag, scope);
467	}
468	return scope;
469	}
470
471	}
472

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 32490

Download in other formats: