Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 25635

Last change on this file since 25635 was 25635, checked in by sjm84, 12 years ago
Fixing Greenstone 3's use (or lack thereof) of generics. This was done automatically, so we may want to change it over time. This change will also auto-format any files that have not already been formatted.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.8 KB

Line
1	package org.greenstone.gsdl3.service;
2
3
4	// Greenstone classes
5	import org.greenstone.gsdl3.util.*;
6
7	// XML classes
8	import org.w3c.dom.Document;
9	import org.w3c.dom.Element;
10	import org.w3c.dom.Node;
11	import org.w3c.dom.Attr;
12	import org.w3c.dom.Text;
13	import org.w3c.dom.NodeList;
14	import org.w3c.dom.NamedNodeMap;
15
16	// General Java classes
17	import java.io.File;
18	import java.io.Serializable;
19	import java.util.Vector;
20	import java.util.HashMap;
21
22	import org.apache.log4j.*;
23
24	public class XMLRetrieve extends ServiceRack {
25
// log4j logger for this class
static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
// Names of the three retrieve services; used both when advertising the
// services in configure() and as the "from" attribute of each response.
protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";

// Stylesheet file name used to build a table of contents. configure() reads
// it from the "tocXSLT" param, falls back to "default_toc", and appends ".xsl".
protected String toc_xsl_name = "";
// Encoding passed to the converter when a document file is parsed.
// configure() reads it from "documentEncoding" and defaults it to "UTF-8".
protected String document_encoding = "";
// Value of the "documentRootTag" param; translateScope() prefixes scopes with
// it. May be null after configure() if the param was not supplied.
protected String document_root_tag = "";

// Document list element taken from extra_info in configure(); getMetadata()
// looks documents up in it by name. Stays null if extra_info has no such list.
protected Element collection_doc_list = null;

// Which services this rack advertises. All default to true; configure() turns
// off any whose keyword is missing from a non-empty "provideServices" param.
protected boolean provide_content = true;
protected boolean provide_structure = true;
protected boolean provide_metadata = true;
40
41
42	public boolean configure(Element info, Element extra_info) {
43	if (!super.configure(info, extra_info)){
44	return false;
45	}
46	logger.info("configuring XMLRetrieve...");
47	// look for the parameters
48	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
49	HashMap<String, Serializable> params;
50	String services_to_provide = "";
51	if (param_list != null) {
52	params = GSXML.extractParams(param_list, false);
53	this.toc_xsl_name = (String)params.get("tocXSLT");
54	this.document_encoding = (String)params.get("documentEncoding");
55	this.document_root_tag = (String)params.get("documentRootTag");
56	services_to_provide = (String)params.get("provideServices");
57	}
58	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
59	this.toc_xsl_name = "default_toc";
60	}
61	this.toc_xsl_name = this.toc_xsl_name+".xsl";
62
63	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
64	this.document_encoding = "UTF-8";
65	}
66
67	if (services_to_provide != null && !services_to_provide.equals("")) {
68	if (services_to_provide.indexOf("content")==-1) {
69	provide_content = false;
70	}
71	if (services_to_provide.indexOf("metadata")==-1) {
72	provide_metadata = false;
73	}
74	if (services_to_provide.indexOf("structure")==-1) {
75	provide_structure = false;
76	}
77
78	}
79
80	// set up short_service_info_ - for now just has name and type
81	Element retrieve_service;
82	if (provide_content) {
83	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
84	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
85	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
86	this.short_service_info.appendChild(retrieve_service);
87	}
88	if (provide_metadata) {
89	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
90	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
91	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
92	this.short_service_info.appendChild(retrieve_service);
93	}
94	if (provide_structure) {
95	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
96	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
97	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
98	this.short_service_info.appendChild(retrieve_service);
99	}
100	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
101	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
102
103	GSEntityResolver resolver = new GSEntityResolver();
104	resolver.setClassLoader(this.class_loader);
105	this.converter.setEntityResolver(resolver);
106	return true;
107	}
108
109	// this may get called but is not useful in the case of retrieve services
110	protected Element getServiceDescription(String service_id, String lang, String subset) {
111
112	Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
113	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
114	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
115	return retrieve_service;
116	}
117
118	protected Element processDocumentContentRetrieve(Element request) {
119	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
120	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
121	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
122
123	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
124	if (doc_list == null) {
125	return result;
126	}
127	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
128	result.appendChild(result_doc_list);
129
130	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
131	for (int i=0; i<docs.getLength(); i++) {
132
133	Element doc = (Element)docs.item(i);
134	Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
135	doc.appendChild(content);
136
137	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
138	String doc_name = getWorkName(node_id);
139
140	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
141	if (doc_elem == null) {
142	continue;
143	}
144
145
146	// if we have asked for the whole doc, just append it
147	if (doc_name.equals(node_id)) {
148	content.appendChild(this.doc.importNode(doc_elem, true));
149	continue;
150	}
151
152	// else we only want a sub section
153
154	Element section = getSection(doc_elem, node_id);
155	if (section != null) {
156	content.appendChild(this.doc.importNode(section, true));
157	}
158
159	} // for each doc
160
161	return result;
162
163	}
164
165	protected Element processDocumentStructureRetrieve(Element request) {
166	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
167	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
168	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
169
170	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
171	if (doc_list == null) {
172	logger.error("no documents specified in the request. ");
173	return result;
174	}
175
176	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
177	result.appendChild(result_doc_list);
178	// first look for the stylesheet in the collection
179	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
180	if (!stylesheet.exists()) {
181	// now try in the site
182	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
183	}
184	if (!stylesheet.exists()) {
185	logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
186	return result;
187	}
188
189	// for now, we dont have any params, and we always return the structure of the whole document
190
191	XMLTransformer transformer = new XMLTransformer();
192	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
193
194	for (int i=0; i<docs.getLength(); i++) {
195
196	Element doc = (Element)docs.item(i);
197
198	Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
199	doc.appendChild(structure);
200	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
201	// make sure we are at the top level
202	doc_name = getWorkName(doc_name);
203
204	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
205
206	if (!doc_file.exists()) {
207	logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
208	} else {
209	try {
210	Node toc = transformer.transform(stylesheet, doc_file);
211	structure.appendChild(this.doc.importNode(toc, true));
212	} catch (Exception e) {
213	logger.error("couldn't transform the document to get the toc");
214	}
215	}
216
217	}
218
219	return result;
220
221	}
222
223	// this just extracts a bit of text from the section to use as the Title
224	// this should be overwritten for any format that has something more suitable
225	protected Element processDocumentMetadataRetrieve(Element request) {
226	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
227	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
228	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
229
230	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
231	if (doc_list == null) {
232	logger.error("no documents in the request");
233	return result;
234	}
235
236	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
237	result.appendChild(result_doc_list);
238
239	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
240	if (param_list == null) {
241	logger.error("no metadata in the request");
242	return result;
243	}
244
245	Vector<String> meta_name_list = new Vector<String>();
246	boolean all_metadata = false;
247	// Process the request parameters
248	Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
249	while (param != null) {
250	// Identify the metadata information desired
251	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
252	String metadata = GSXML.getValue(param);
253	if (metadata.equals("all")) {
254	all_metadata = true;
255	break;
256	}
257	meta_name_list.add(metadata);
258	}
259	param = (Element) param.getNextSibling();
260	}
261
262	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
263	for (int i=0; i<docs.getLength(); i++) {
264	Element doc = (Element)docs.item(i);
265	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
266	String doc_name = getWorkName(node_id);
267
268	Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
269	doc.appendChild(metadata_list);
270	}
271
272	return result;
273	}
274
275	protected Element loadDocument(String doc_name) {
276	// try to find the document
277	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
278
279	if (!doc_file.exists()) {
280	logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
281	return null;
282	}
283
284	Document the_doc = null;
285	try {
286	the_doc = this.converter.getDOM(doc_file, this.document_encoding);
287	} catch (Exception e) {
288	logger.error("couldn't create a DOM from file "+doc_file.getPath());
289	return null;
290	}
291
292	return the_doc.getDocumentElement();
293
294	}
295
296
297	protected Element getSection(Element doc_elem, String node_id) {
298	String [] bits = node_id.split("\\.");
299	if (bits.length > 4) {
300	logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
301	return null;
302	}
303
304	String id="";
305	String tagname = "";
306	String scope = "";
307	if (bits.length==2) {
308	tagname = bits[1];
309	} else {
310	scope = bits[1];
311	tagname = bits[2];
312
313	if (bits.length == 4) {
314	id = bits[3];
315	}
316	}
317	scope = translateScope(scope);
318	Element top=null;
319	if (!scope.equals("")) {
320	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
321	if (top == null) {
322	// something gone wrong
323	return null;
324	}
325	} else {
326	top = doc_elem;
327	}
328
329	NodeList elements = top.getElementsByTagName(tagname);
330	if (elements.getLength() == 0) {
331	return null;
332	}
333	// no id, just return the first one
334	if (id.equals("")) {
335	return (Element)elements.item(0);
336	}
337	// have an id, need to check and find the right one.
338	for (int i=0; i<elements.getLength();i++) {
339	Element e = (Element)elements.item(i);
340	if (e.getAttribute("gs3:id").equals(id)) {
341	return e;
342	}
343	}
344	return null;
345
346	}
347
348	protected Element getMetadata(String node_id, boolean all, Vector<String> meta_name_list) {
349
350	// our default strategy here is to only return Title and root:Title
351	// ignore all others
352	// the title of a section is just a little bit of the text inside it.
353	// the root_Title is the title from the doc info in the config file
354	Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
355	String doc_name = getWorkName(node_id);
356	boolean node_is_root = false;
357	if (doc_name.equals(node_id)) {
358	node_is_root = true;
359	}
360
361	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
362	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
363
364	boolean get_section_title = false;
365
366	if (all) {
367	if (node_is_root) {
368	return (Element)this.doc.importNode(doc_meta_list, true);
369	} else {
370	get_section_title = true;
371	}
372
373	} else {
374	// have to process metadata one by one
375	for (int i=0; i<meta_name_list.size(); i++) {
376	String meta_name = meta_name_list.elementAt(i);
377	String actual_meta_name = meta_name;
378	if (meta_name.startsWith("root_")) {
379	actual_meta_name = meta_name.substring(5);
380	} else {
381	// its a section level one - check to see if doc is root
382	if (!node_is_root) {
383	if (meta_name.equals("Title")) {
384	get_section_title = true;
385	}
386	continue; // move on to teh next metadata
387	}
388	}
389
390	// here, we look for the specific meta elem in doc_meta_list
391	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
392	if (meta_item != null) {
393	meta_item = (Element)this.doc.importNode(meta_item, true);
394	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
395	metadata_list.appendChild(meta_item);
396	}
397	} // for each metadata
398	}
399
400	// now we have processed all teh doc metadata, just have section one to go, if needed
401	if (get_section_title) {
402
403	Element doc_elem = loadDocument(doc_name);
404	if (doc_elem != null) {
405	Element section = getSection(doc_elem, node_id);
406	if (section != null) {
407	Element title_meta = extractTitleMeta(section);
408	if (title_meta != null) {
409	metadata_list.appendChild(title_meta);
410	}
411	}
412	}
413
414	}
415	return metadata_list;
416	}
417
418	protected Element extractTitleMeta(Element section) {
419	Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
420	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
421
422	String title = "dummy title";
423	Text t = this.doc.createTextNode(title);
424	meta_elem.appendChild(t);
425	return meta_elem;
426
427	}
428	// some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
429
430	// the full default nodeID looks like work.scope.tag.id
431	// the shorter versions are work, work.tag, work.scope.tag
432	protected String getWorkName(String node_id) {
433	int pos = node_id.indexOf('.');
434	if (pos == -1) {
435	return node_id;
436	}
437	return node_id.substring(0, pos);
438	}
439
440	// this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
441	protected String translateScope(String scope) {
442	if (this.document_root_tag != null) {
443	return GSPath.appendLink(this.document_root_tag, scope);
444	}
445	return scope;
446	}
447
448	}
449

Note: See TracBrowser for help on using the repository browser.

Download in other formats: