Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 32453

Last change on this file since 32453 was 28966, checked in by kjdon, 10 years ago
Lots of changes, mainly to do with removing this.doc from everywhere. Document is not thread safe. Now we tend to create a new Document every time we are starting a new page/message etc. In a service, this.desc_doc is available as the document to create service info stuff, but it should only be used for this and not for other messages. newDOM is now static for XMLConverter. Method param changes for some GSXML methods.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.0 KB

Line
1	/*
2	* XMLRetrieve.java
3	* Copyright (C) 2014 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19
20	package org.greenstone.gsdl3.service;
21
22
23	// Greenstone classes
24	import org.greenstone.gsdl3.util.*;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Attr;
31	import org.w3c.dom.Text;
32	import org.w3c.dom.NodeList;
33	import org.w3c.dom.NamedNodeMap;
34
35	// General Java classes
36	import java.io.File;
37	import java.io.Serializable;
38	import java.util.Vector;
39	import java.util.HashMap;
40
41	import org.apache.log4j.*;
42
43	public class XMLRetrieve extends ServiceRack {
44
45	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
46	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
47	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
48	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
49
50	protected String toc_xsl_name = "";
51	protected String document_encoding = "";
52	protected String document_root_tag = "";
53
54	protected Element collection_doc_list = null;
55
56	protected boolean provide_content = true;
57	protected boolean provide_structure = true;
58	protected boolean provide_metadata = true;
59
60	protected GSEntityResolver entity_resolver = null;
61
62	public boolean configure(Element info, Element extra_info) {
63	if (!super.configure(info, extra_info)){
64	return false;
65	}
66	logger.info("configuring XMLRetrieve...");
67	// look for the parameters
68	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
69	HashMap<String, Serializable> params;
70	String services_to_provide = "";
71	if (param_list != null) {
72	params = GSXML.extractParams(param_list, false);
73	this.toc_xsl_name = (String)params.get("tocXSLT");
74	this.document_encoding = (String)params.get("documentEncoding");
75	this.document_root_tag = (String)params.get("documentRootTag");
76	services_to_provide = (String)params.get("provideServices");
77	}
78	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
79	this.toc_xsl_name = "default_toc";
80	}
81	this.toc_xsl_name = this.toc_xsl_name+".xsl";
82
83	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
84	this.document_encoding = "UTF-8";
85	}
86
87	if (services_to_provide != null && !services_to_provide.equals("")) {
88	if (services_to_provide.indexOf("content")==-1) {
89	provide_content = false;
90	}
91	if (services_to_provide.indexOf("metadata")==-1) {
92	provide_metadata = false;
93	}
94	if (services_to_provide.indexOf("structure")==-1) {
95	provide_structure = false;
96	}
97
98	}
99
100	// set up short_service_info_ - for now just has name and type
101	Element retrieve_service;
102	if (provide_content) {
103	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
104	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
105	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
106	this.short_service_info.appendChild(retrieve_service);
107	}
108	if (provide_metadata) {
109	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
110	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
111	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
112	this.short_service_info.appendChild(retrieve_service);
113	}
114	if (provide_structure) {
115	retrieve_service = this.desc_doc.createElement(GSXML.SERVICE_ELEM);
116	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
117	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
118	this.short_service_info.appendChild(retrieve_service);
119	}
120	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
121	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
122
123	entity_resolver = new GSEntityResolver();
124	entity_resolver.setClassLoader(this.class_loader);
125	//this.converter.setEntityResolver(resolver);
126	return true;
127	}
128
129	// this may get called but is not useful in the case of retrieve services
130	protected Element getServiceDescription(Document doc, String service_id, String lang, String subset) {
131
132	Element retrieve_service = doc.createElement(GSXML.SERVICE_ELEM);
133	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
134	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
135	return retrieve_service;
136	}
137
138	protected Element processDocumentContentRetrieve(Element request) {
139	Document result_doc = XMLConverter.newDOM();
140	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
141	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
142	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
143
144	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
145	if (doc_list == null) {
146	return result;
147	}
148	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
149	result.appendChild(result_doc_list);
150
151	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
152	for (int i=0; i<docs.getLength(); i++) {
153
154	Element doc = (Element)docs.item(i);
155	Element content = result_doc.createElement(GSXML.NODE_CONTENT_ELEM);
156	doc.appendChild(content);
157
158	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
159	String doc_name = getWorkName(node_id);
160
161	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
162	if (doc_elem == null) {
163	continue;
164	}
165
166
167	// if we have asked for the whole doc, just append it
168	if (doc_name.equals(node_id)) {
169	content.appendChild(result_doc.importNode(doc_elem, true));
170	continue;
171	}
172
173	// else we only want a sub section
174
175	Element section = getSection(doc_elem, node_id);
176	if (section != null) {
177	content.appendChild(result_doc.importNode(section, true));
178	}
179
180	} // for each doc
181
182	return result;
183
184	}
185
186	protected Element processDocumentStructureRetrieve(Element request) {
187	Document result_doc = XMLConverter.newDOM();
188	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
189	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
190	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
191
192	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
193	if (doc_list == null) {
194	logger.error("no documents specified in the request. ");
195	return result;
196	}
197
198	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
199	result.appendChild(result_doc_list);
200	// first look for the stylesheet in the collection
201	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
202	if (!stylesheet.exists()) {
203	// now try in the site
204	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
205	}
206	if (!stylesheet.exists()) {
207	logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
208	return result;
209	}
210
211	// for now, we dont have any params, and we always return the structure of the whole document
212
213	XMLTransformer transformer = new XMLTransformer();
214	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
215
216	for (int i=0; i<docs.getLength(); i++) {
217
218	Element doc = (Element)docs.item(i);
219
220	Element structure = result_doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
221	doc.appendChild(structure);
222	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
223	// make sure we are at the top level
224	doc_name = getWorkName(doc_name);
225
226	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
227
228	if (!doc_file.exists()) {
229	logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
230	} else {
231	try {
232	Node toc = transformer.transform(stylesheet, doc_file);
233	structure.appendChild(result_doc.importNode(toc, true));
234	} catch (Exception e) {
235	logger.error("couldn't transform the document to get the toc");
236	}
237	}
238
239	}
240
241	return result;
242
243	}
244
245	// this just extracts a bit of text from the section to use as the Title
246	// this should be overwritten for any format that has something more suitable
247	protected Element processDocumentMetadataRetrieve(Element request) {
248	Document result_doc = XMLConverter.newDOM();
249	Element result = result_doc.createElement(GSXML.RESPONSE_ELEM);
250	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
251	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
252
253	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
254	if (doc_list == null) {
255	logger.error("no documents in the request");
256	return result;
257	}
258
259	Element result_doc_list = (Element)result_doc.importNode(doc_list, true);
260	result.appendChild(result_doc_list);
261
262	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
263	if (param_list == null) {
264	logger.error("no metadata in the request");
265	return result;
266	}
267
268	Vector<String> meta_name_list = new Vector<String>();
269	boolean all_metadata = false;
270	// Process the request parameters
271	Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
272	while (param != null) {
273	// Identify the metadata information desired
274	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
275	String metadata = GSXML.getValue(param);
276	if (metadata.equals("all")) {
277	all_metadata = true;
278	break;
279	}
280	meta_name_list.add(metadata);
281	}
282	param = (Element) param.getNextSibling();
283	}
284
285	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
286	for (int i=0; i<docs.getLength(); i++) {
287	Element doc = (Element)docs.item(i);
288	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
289	String doc_name = getWorkName(node_id);
290
291	Element metadata_list = getMetadata(result_doc, node_id, all_metadata, meta_name_list);
292	doc.appendChild(metadata_list);
293	}
294
295	return result;
296	}
297
298	protected Element loadDocument(String doc_name) {
299	// try to find the document
300	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
301
302	if (!doc_file.exists()) {
303	logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
304	return null;
305	}
306
307	Document the_doc = null;
308	try {
309	the_doc = this.converter.getDOM(doc_file, this.document_encoding, this.entity_resolver);
310	} catch (Exception e) {
311	logger.error("couldn't create a DOM from file "+doc_file.getPath());
312	return null;
313	}
314
315	return the_doc.getDocumentElement();
316
317	}
318
319
320	protected Element getSection(Element doc_elem, String node_id) {
321	String [] bits = node_id.split("\\.");
322	if (bits.length > 4) {
323	logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
324	return null;
325	}
326
327	String id="";
328	String tagname = "";
329	String scope = "";
330	if (bits.length==2) {
331	tagname = bits[1];
332	} else {
333	scope = bits[1];
334	tagname = bits[2];
335
336	if (bits.length == 4) {
337	id = bits[3];
338	}
339	}
340	scope = translateScope(scope);
341	Element top=null;
342	if (!scope.equals("")) {
343	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
344	if (top == null) {
345	// something gone wrong
346	return null;
347	}
348	} else {
349	top = doc_elem;
350	}
351
352	NodeList elements = top.getElementsByTagName(tagname);
353	if (elements.getLength() == 0) {
354	return null;
355	}
356	// no id, just return the first one
357	if (id.equals("")) {
358	return (Element)elements.item(0);
359	}
360	// have an id, need to check and find the right one.
361	for (int i=0; i<elements.getLength();i++) {
362	Element e = (Element)elements.item(i);
363	if (e.getAttribute("gs3:id").equals(id)) {
364	return e;
365	}
366	}
367	return null;
368
369	}
370
371	protected Element getMetadata(Document result_doc, String node_id, boolean all, Vector<String> meta_name_list) {
372
373	// our default strategy here is to only return Title and root:Title
374	// ignore all others
375	// the title of a section is just a little bit of the text inside it.
376	// the root_Title is the title from the doc info in the config file
377	Element metadata_list = result_doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
378	String doc_name = getWorkName(node_id);
379	boolean node_is_root = false;
380	if (doc_name.equals(node_id)) {
381	node_is_root = true;
382	}
383
384	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
385	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
386
387	boolean get_section_title = false;
388
389	if (all) {
390	if (node_is_root) {
391	return (Element)result_doc.importNode(doc_meta_list, true);
392	} else {
393	get_section_title = true;
394	}
395
396	} else {
397	// have to process metadata one by one
398	for (int i=0; i<meta_name_list.size(); i++) {
399	String meta_name = meta_name_list.elementAt(i);
400	String actual_meta_name = meta_name;
401	if (meta_name.startsWith("root_")) {
402	actual_meta_name = meta_name.substring(5);
403	} else {
404	// its a section level one - check to see if doc is root
405	if (!node_is_root) {
406	if (meta_name.equals("Title")) {
407	get_section_title = true;
408	}
409	continue; // move on to teh next metadata
410	}
411	}
412
413	// here, we look for the specific meta elem in doc_meta_list
414	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
415	if (meta_item != null) {
416	meta_item = (Element)result_doc.importNode(meta_item, true);
417	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
418	metadata_list.appendChild(meta_item);
419	}
420	} // for each metadata
421	}
422
423	// now we have processed all teh doc metadata, just have section one to go, if needed
424	if (get_section_title) {
425
426	Element doc_elem = loadDocument(doc_name);
427	if (doc_elem != null) {
428	Element section = getSection(doc_elem, node_id);
429	if (section != null) {
430	Element title_meta = extractTitleMeta(result_doc, section);
431	if (title_meta != null) {
432	metadata_list.appendChild(title_meta);
433	}
434	}
435	}
436
437	}
438	return metadata_list;
439	}
440
441	protected Element extractTitleMeta(Document result_doc, Element section) {
442	Element meta_elem = result_doc.createElement(GSXML.METADATA_ELEM);
443	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
444
445	String title = "dummy title";
446	Text t = result_doc.createTextNode(title);
447	meta_elem.appendChild(t);
448	return meta_elem;
449
450	}
451	// some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
452
453	// the full default nodeID looks like work.scope.tag.id
454	// the shorter versions are work, work.tag, work.scope.tag
455	protected String getWorkName(String node_id) {
456	int pos = node_id.indexOf('.');
457	if (pos == -1) {
458	return node_id;
459	}
460	return node_id.substring(0, pos);
461	}
462
463	// this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
464	protected String translateScope(String scope) {
465	if (this.document_root_tag != null) {
466	return GSPath.appendLink(this.document_root_tag, scope);
467	}
468	return scope;
469	}
470
471	}
472

Note: See TracBrowser for help on using the repository browser.

Download in other formats: