Context Navigation

source: greenstone3/trunk/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 14225

Last change on this file since 14225 was 14225, checked in by xiao, 17 years ago
Changed getFirstChild() to getFirstElementChild(), in case an extra line break or white space has been added before the first element child, which might cause a cast exception.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.8 KB

Line
1	package org.greenstone.gsdl3.service;
2
3
4	// Greenstone classes
5	import org.greenstone.gsdl3.util.*;
6
7	// XML classes
8	import org.w3c.dom.Document;
9	import org.w3c.dom.Element;
10	import org.w3c.dom.Node;
11	import org.w3c.dom.Attr;
12	import org.w3c.dom.Text;
13	import org.w3c.dom.NodeList;
14	import org.w3c.dom.NamedNodeMap;
15
16	// General Java classes
17	import java.io.File;
18	import java.util.Vector;
19	import java.util.HashMap;
20
21	import org.apache.log4j.*;
22
23	public class XMLRetrieve extends ServiceRack {
24
25	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName());
26	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
27	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
28	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
29
30	protected String toc_xsl_name = "";
31	protected String document_encoding = "";
32	protected String document_root_tag = "";
33
34	protected Element collection_doc_list = null;
35
36	protected boolean provide_content = true;
37	protected boolean provide_structure = true;
38	protected boolean provide_metadata = true;
39
40
41	public boolean configure(Element info, Element extra_info) {
42	if (!super.configure(info, extra_info)){
43	return false;
44	}
45	logger.info("configuring XMLRetrieve...");
46	// look for the parameters
47	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
48	HashMap params;
49	String services_to_provide = "";
50	if (param_list != null) {
51	params = GSXML.extractParams(param_list, false);
52	this.toc_xsl_name = (String)params.get("tocXSLT");
53	this.document_encoding = (String)params.get("documentEncoding");
54	this.document_root_tag = (String)params.get("documentRootTag");
55	services_to_provide = (String)params.get("provideServices");
56	}
57	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
58	this.toc_xsl_name = "default_toc";
59	}
60	this.toc_xsl_name = this.toc_xsl_name+".xsl";
61
62	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
63	this.document_encoding = "UTF-8";
64	}
65
66	if (services_to_provide != null && !services_to_provide.equals("")) {
67	if (services_to_provide.indexOf("content")==-1) {
68	provide_content = false;
69	}
70	if (services_to_provide.indexOf("metadata")==-1) {
71	provide_metadata = false;
72	}
73	if (services_to_provide.indexOf("structure")==-1) {
74	provide_structure = false;
75	}
76
77	}
78
79	// set up short_service_info_ - for now just has name and type
80	Element retrieve_service;
81	if (provide_content) {
82	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
83	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
84	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
85	this.short_service_info.appendChild(retrieve_service);
86	}
87	if (provide_metadata) {
88	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
89	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
90	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
91	this.short_service_info.appendChild(retrieve_service);
92	}
93	if (provide_structure) {
94	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
95	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
96	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
97	this.short_service_info.appendChild(retrieve_service);
98	}
99	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
100	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
101
102	GSEntityResolver resolver = new GSEntityResolver();
103	resolver.setClassLoader(this.class_loader);
104	this.converter.setEntityResolver(resolver);
105	return true;
106	}
107
108	// this may get called but is not useful in the case of retrieve services
109	protected Element getServiceDescription(String service_id, String lang, String subset) {
110
111	Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
112	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
113	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
114	return retrieve_service;
115	}
116
117	protected Element processDocumentContentRetrieve(Element request) {
118	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
119	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
120	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
121
122	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
123	if (doc_list == null) {
124	return result;
125	}
126	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
127	result.appendChild(result_doc_list);
128
129	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
130	for (int i=0; i<docs.getLength(); i++) {
131
132	Element doc = (Element)docs.item(i);
133	Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
134	doc.appendChild(content);
135
136	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
137	String doc_name = getWorkName(node_id);
138
139	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
140	if (doc_elem == null) {
141	continue;
142	}
143
144
145	// if we have asked for the whole doc, just append it
146	if (doc_name.equals(node_id)) {
147	content.appendChild(this.doc.importNode(doc_elem, true));
148	continue;
149	}
150
151	// else we only want a sub section
152
153	Element section = getSection(doc_elem, node_id);
154	if (section != null) {
155	content.appendChild(this.doc.importNode(section, true));
156	}
157
158	} // for each doc
159
160	return result;
161
162	}
163
164	protected Element processDocumentStructureRetrieve(Element request) {
165	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
167	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168
169	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
170	if (doc_list == null) {
171	logger.error("no documents specified in the request. ");
172	return result;
173	}
174
175	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
176	result.appendChild(result_doc_list);
177	// first look for the stylesheet in the collection
178	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
179	if (!stylesheet.exists()) {
180	// now try in the site
181	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
182	}
183	if (!stylesheet.exists()) {
184	logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
185	return result;
186	}
187
188	// for now, we dont have any params, and we always return the structure of the whole document
189
190	XMLTransformer transformer = new XMLTransformer();
191	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
192
193	for (int i=0; i<docs.getLength(); i++) {
194
195	Element doc = (Element)docs.item(i);
196
197	Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
198	doc.appendChild(structure);
199	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
200	// make sure we are at the top level
201	doc_name = getWorkName(doc_name);
202
203	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
204
205	if (!doc_file.exists()) {
206	logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
207	} else {
208	try {
209	Node toc = transformer.transform(stylesheet, doc_file);
210	structure.appendChild(this.doc.importNode(toc, true));
211	} catch (Exception e) {
212	logger.error("couldn't transform the document to get the toc");
213	}
214	}
215
216	}
217
218	return result;
219
220	}
221
222	// this just extracts a bit of text from the section to use as the Title
223	// this should be overwritten for any format that has something more suitable
224	protected Element processDocumentMetadataRetrieve(Element request) {
225	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
226	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
227	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
228
229	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
230	if (doc_list == null) {
231	logger.error("no documents in the request");
232	return result;
233	}
234
235	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
236	result.appendChild(result_doc_list);
237
238	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
239	if (param_list == null) {
240	logger.error("no metadata in the request");
241	return result;
242	}
243
244	Vector meta_name_list = new Vector();
245	boolean all_metadata = false;
246	// Process the request parameters
247	Element param = GSXML.getFirstElementChild(param_list);//(Element) param_list.getFirstChild();
248	while (param != null) {
249	// Identify the metadata information desired
250	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
251	String metadata = GSXML.getValue(param);
252	if (metadata.equals("all")) {
253	all_metadata = true;
254	break;
255	}
256	meta_name_list.add(metadata);
257	}
258	param = (Element) param.getNextSibling();
259	}
260
261	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
262	for (int i=0; i<docs.getLength(); i++) {
263	Element doc = (Element)docs.item(i);
264	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
265	String doc_name = getWorkName(node_id);
266
267	Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
268	doc.appendChild(metadata_list);
269	}
270
271	return result;
272	}
273
274	protected Element loadDocument(String doc_name) {
275	// try to find the document
276	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
277
278	if (!doc_file.exists()) {
279	logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
280	return null;
281	}
282
283	Document the_doc = null;
284	try {
285	the_doc = this.converter.getDOM(doc_file, this.document_encoding);
286	} catch (Exception e) {
287	logger.error("couldn't create a DOM from file "+doc_file.getPath());
288	return null;
289	}
290
291	return the_doc.getDocumentElement();
292
293	}
294
295
296	protected Element getSection(Element doc_elem, String node_id) {
297	String [] bits = node_id.split("\\.");
298	if (bits.length > 4) {
299	logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
300	return null;
301	}
302
303	String id="";
304	String tagname = "";
305	String scope = "";
306	if (bits.length==2) {
307	tagname = bits[1];
308	} else {
309	scope = bits[1];
310	tagname = bits[2];
311
312	if (bits.length == 4) {
313	id = bits[3];
314	}
315	}
316	scope = translateScope(scope);
317	Element top=null;
318	if (!scope.equals("")) {
319	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
320	if (top == null) {
321	// something gone wrong
322	return null;
323	}
324	} else {
325	top = doc_elem;
326	}
327
328	NodeList elements = top.getElementsByTagName(tagname);
329	if (elements.getLength() == 0) {
330	return null;
331	}
332	// no id, just return the first one
333	if (id.equals("")) {
334	return (Element)elements.item(0);
335	}
336	// have an id, need to check and find the right one.
337	for (int i=0; i<elements.getLength();i++) {
338	Element e = (Element)elements.item(i);
339	if (e.getAttribute("gs3:id").equals(id)) {
340	return e;
341	}
342	}
343	return null;
344
345	}
346
347	protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
348
349	// our default strategy here is to only return Title and root:Title
350	// ignore all others
351	// the title of a section is just a little bit of the text inside it.
352	// the root_Title is the title from the doc info in the config file
353	Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
354	String doc_name = getWorkName(node_id);
355	boolean node_is_root = false;
356	if (doc_name.equals(node_id)) {
357	node_is_root = true;
358	}
359
360	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
361	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
362
363	boolean get_section_title = false;
364
365	if (all) {
366	if (node_is_root) {
367	return (Element)this.doc.importNode(doc_meta_list, true);
368	} else {
369	get_section_title = true;
370	}
371
372	} else {
373	// have to process metadata one by one
374	for (int i=0; i<meta_name_list.size(); i++) {
375	String meta_name = (String) meta_name_list.elementAt(i);
376	String actual_meta_name = meta_name;
377	if (meta_name.startsWith("root_")) {
378	actual_meta_name = meta_name.substring(5);
379	} else {
380	// its a section level one - check to see if doc is root
381	if (!node_is_root) {
382	if (meta_name.equals("Title")) {
383	get_section_title = true;
384	}
385	continue; // move on to teh next metadata
386	}
387	}
388
389	// here, we look for the specific meta elem in doc_meta_list
390	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
391	if (meta_item != null) {
392	meta_item = (Element)this.doc.importNode(meta_item, true);
393	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
394	metadata_list.appendChild(meta_item);
395	}
396	} // for each metadata
397	}
398
399	// now we have processed all teh doc metadata, just have section one to go, if needed
400	if (get_section_title) {
401
402	Element doc_elem = loadDocument(doc_name);
403	if (doc_elem != null) {
404	Element section = getSection(doc_elem, node_id);
405	if (section != null) {
406	Element title_meta = extractTitleMeta(section);
407	if (title_meta != null) {
408	metadata_list.appendChild(title_meta);
409	}
410	}
411	}
412
413	}
414	return metadata_list;
415	}
416
417	protected Element extractTitleMeta(Element section) {
418	Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
419	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
420
421	String title = "dummy title";
422	Text t = this.doc.createTextNode(title);
423	meta_elem.appendChild(t);
424	return meta_elem;
425
426	}
427	// some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
428
429	// the full default nodeID looks like work.scope.tag.id
430	// the shorter versions are work, work.tag, work.scope.tag
431	protected String getWorkName(String node_id) {
432	int pos = node_id.indexOf('.');
433	if (pos == -1) {
434	return node_id;
435	}
436	return node_id.substring(0, pos);
437	}
438
439	// this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
440	protected String translateScope(String scope) {
441	if (this.document_root_tag != null) {
442	return GSPath.appendLink(this.document_root_tag, scope);
443	}
444	return scope;
445	}
446
447	}
448

Note: See TracBrowser for help on using the repository browser.

Download in other formats: