Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 13270

Last change on this file since 13270 was 13270, checked in by shaoqun, 17 years ago
replace Category class which is deprecated with Logger class
Property svn:keywords set to `Author Date Id Revision`
File size: 14.7 KB

Line
1	package org.greenstone.gsdl3.service;
2
3
4	// Greenstone classes
5	import org.greenstone.gsdl3.util.*;
6
7	// XML classes
8	import org.w3c.dom.Document;
9	import org.w3c.dom.Element;
10	import org.w3c.dom.Node;
11	import org.w3c.dom.Attr;
12	import org.w3c.dom.Text;
13	import org.w3c.dom.NodeList;
14	import org.w3c.dom.NamedNodeMap;
15
16	// General Java classes
17	import java.io.File;
18	import java.util.Vector;
19	import java.util.HashMap;
20
21	import org.apache.log4j.*;
22
23	public class XMLRetrieve extends ServiceRack {
24
25	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.service.XMLRetrieve.class.getName()); // log4j logger named after this class
26	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve"; // service name advertised when provide_content is set
27	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve"; // service name advertised when provide_metadata is set
28	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve"; // service name advertised when provide_structure is set
29
30	protected String toc_xsl_name = ""; // table-of-contents stylesheet file name; configure() reads "tocXSLT", defaults to "default_toc" and appends ".xsl"
31	protected String document_encoding = ""; // encoding passed to the converter when loading a document; configure() reads "documentEncoding", defaults to "UTF-8"
32	protected String document_root_tag = ""; // configure() reads "documentRootTag" (may end up null); used by translateScope()
33
34	protected Element collection_doc_list = null; // document list element taken from extra_info in configure(); consulted by getMetadata()
35
36	protected boolean provide_content = true; // cleared by configure() when "provideServices" is given and lacks "content"
37	protected boolean provide_structure = true; // cleared by configure() when "provideServices" is given and lacks "structure"
38	protected boolean provide_metadata = true; // cleared by configure() when "provideServices" is given and lacks "metadata"
39
40
41	public boolean configure(Element info, Element extra_info) {
42	if (!super.configure(info, extra_info)){
43	return false;
44	}
45	logger.info("configuring XMLRetrieve...");
46	// look for the parameters
47	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
48	HashMap params;
49	String services_to_provide = "";
50	if (param_list != null) {
51	params = GSXML.extractParams(param_list, false);
52	this.toc_xsl_name = (String)params.get("tocXSLT");
53	this.document_encoding = (String)params.get("documentEncoding");
54	this.document_root_tag = (String)params.get("documentRootTag");
55	services_to_provide = (String)params.get("provideServices");
56	}
57	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
58	this.toc_xsl_name = "default_toc";
59	}
60	this.toc_xsl_name = this.toc_xsl_name+".xsl";
61
62	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
63	this.document_encoding = "UTF-8";
64	}
65
66	if (services_to_provide != null && !services_to_provide.equals("")) {
67	if (services_to_provide.indexOf("content")==-1) {
68	provide_content = false;
69	}
70	if (services_to_provide.indexOf("metadata")==-1) {
71	provide_metadata = false;
72	}
73	if (services_to_provide.indexOf("structure")==-1) {
74	provide_structure = false;
75	}
76
77	}
78
79	// set up short_service_info_ - for now just has name and type
80	Element retrieve_service;
81	if (provide_content) {
82	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
83	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
84	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
85	this.short_service_info.appendChild(retrieve_service);
86	}
87	if (provide_metadata) {
88	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
89	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
90	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
91	this.short_service_info.appendChild(retrieve_service);
92	}
93	if (provide_structure) {
94	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
95	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
96	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
97	this.short_service_info.appendChild(retrieve_service);
98	}
99	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
100	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
101
102	GSEntityResolver resolver = new GSEntityResolver();
103	resolver.setClassLoader(this.class_loader);
104	this.converter.setEntityResolver(resolver);
105	return true;
106	}
107
108	// this may get called but is not useful in the case of retrieve services
109	protected Element getServiceDescription(String service_id, String lang, String subset) {
110
111	Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
112	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
113	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
114	return retrieve_service;
115	}
116
117	protected Element processDocumentContentRetrieve(Element request) {
118	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
119	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
120	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
121
122	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
123	if (doc_list == null) {
124	return result;
125	}
126	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
127	result.appendChild(result_doc_list);
128
129	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
130	for (int i=0; i<docs.getLength(); i++) {
131
132	Element doc = (Element)docs.item(i);
133	Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
134	doc.appendChild(content);
135
136	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
137	String doc_name = getWorkName(node_id);
138
139	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
140	if (doc_elem == null) {
141	continue;
142	}
143
144
145	// if we have asked for the whole doc, just append it
146	if (doc_name.equals(node_id)) {
147	content.appendChild(this.doc.importNode(doc_elem, true));
148	continue;
149	}
150
151	// else we only want a sub section
152
153	Element section = getSection(doc_elem, node_id);
154	if (section != null) {
155	content.appendChild(this.doc.importNode(section, true));
156	}
157
158	} // for each doc
159
160	return result;
161
162	}
163
164	protected Element processDocumentStructureRetrieve(Element request) {
165	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
166	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
167	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
168
169	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
170	if (doc_list == null) {
171	logger.error("no documents specified in the request. ");
172	return result;
173	}
174
175	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
176	result.appendChild(result_doc_list);
177	// first look for the stylesheet in the collection
178	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
179	if (!stylesheet.exists()) {
180	// now try in the site
181	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
182	}
183	if (!stylesheet.exists()) {
184	logger.error("couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
185	return result;
186	}
187
188	// for now, we dont have any params, and we always return the structure of the whole document
189
190	XMLTransformer transformer = new XMLTransformer();
191	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
192
193	for (int i=0; i<docs.getLength(); i++) {
194
195	Element doc = (Element)docs.item(i);
196
197	Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
198	doc.appendChild(structure);
199	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
200	// make sure we are at the top level
201	doc_name = getWorkName(doc_name);
202
203	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
204
205	if (!doc_file.exists()) {
206	logger.error("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
207	} else {
208	try {
209	Node toc = transformer.transform(stylesheet, doc_file);
210	structure.appendChild(this.doc.importNode(toc, true));
211	} catch (Exception e) {
212	logger.error("couldn't transform the document to get the toc");
213	}
214	}
215
216	}
217
218	return result;
219
220	}
221
222	// this just extracts a bit of text from the section to use as the Title
223	// this should be overridden for any format that has something more suitable
224	protected Element processDocumentMetadataRetrieve(Element request) {
225	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
226	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
227	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
228
229	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
230	if (doc_list == null) {
231	logger.error("no documents in the request");
232	return result;
233	}
234
235	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
236	result.appendChild(result_doc_list);
237
238	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
239	if (param_list == null) {
240	logger.error("no metadata in the request");
241	return result;
242	}
243
244	Vector meta_name_list = new Vector();
245	boolean all_metadata = false;
246	// Process the request parameters
247	Element param = (Element) param_list.getFirstChild();
248	while (param != null) {
249	// Identify the metadata information desired
250	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
251	String metadata = GSXML.getValue(param);
252	if (metadata.equals("all")) {
253	all_metadata = true;
254	break;
255	}
256	meta_name_list.add(metadata);
257	}
258	param = (Element) param.getNextSibling();
259	}
260
261	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
262	for (int i=0; i<docs.getLength(); i++) {
263	Element doc = (Element)docs.item(i);
264	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
265	String doc_name = getWorkName(node_id);
266
267	Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
268	doc.appendChild(metadata_list);
269	}
270
271	return result;
272	}
273
274	protected Element loadDocument(String doc_name) {
275	// try to find the document
276	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
277
278	if (!doc_file.exists()) {
279	logger.info("couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
280	return null;
281	}
282
283	Document the_doc = null;
284	try {
285	the_doc = this.converter.getDOM(doc_file, this.document_encoding);
286	} catch (Exception e) {
287	logger.error("couldn't create a DOM from file "+doc_file.getPath());
288	return null;
289	}
290
291	return the_doc.getDocumentElement();
292
293	}
294
295
296	protected Element getSection(Element doc_elem, String node_id) {
297	String [] bits = node_id.split("\\.");
298	if (bits.length > 4) {
299	logger.error("badly formatted node id ("+node_id +"), cant retrieve the section");
300	return null;
301	}
302
303	String id="";
304	String tagname = "";
305	String scope = "";
306	if (bits.length==2) {
307	tagname = bits[1];
308	} else {
309	scope = bits[1];
310	tagname = bits[2];
311
312	if (bits.length == 4) {
313	id = bits[3];
314	}
315	}
316	scope = translateScope(scope);
317	Element top=null;
318	if (!scope.equals("")) {
319	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
320	if (top == null) {
321	// something gone wrong
322	return null;
323	}
324	} else {
325	top = doc_elem;
326	}
327
328	NodeList elements = top.getElementsByTagName(tagname);
329	if (elements.getLength() == 0) {
330	return null;
331	}
332	// no id, just return the first one
333	if (id.equals("")) {
334	return (Element)elements.item(0);
335	}
336	// have an id, need to check and find the right one.
337	for (int i=0; i<elements.getLength();i++) {
338	Element e = (Element)elements.item(i);
339	if (e.getAttribute("gs3:id").equals(id)) {
340	return e;
341	}
342	}
343	return null;
344
345	}
346
347	protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
348
349	// our default strategy here is to only return Title and root:Title
350	// ignore all others
351	// the title of a section is just a little bit of the text inside it.
352	// the root_Title is the title from the doc info in the config file
353	Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
354	String doc_name = getWorkName(node_id);
355	boolean node_is_root = false;
356	if (doc_name.equals(node_id)) {
357	node_is_root = true;
358	}
359
360	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
361	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
362
363	boolean get_section_title = false;
364
365	if (all) {
366	if (node_is_root) {
367	return (Element)this.doc.importNode(doc_meta_list, true);
368	} else {
369	get_section_title = true;
370	}
371
372	} else {
373	// have to process metadata one by one
374	for (int i=0; i<meta_name_list.size(); i++) {
375	String meta_name = (String) meta_name_list.elementAt(i);
376	String actual_meta_name = meta_name;
377	if (meta_name.startsWith("root_")) {
378	actual_meta_name = meta_name.substring(5);
379	} else {
380	// its a section level one - check to see if doc is root
381	if (!node_is_root) {
382	if (meta_name.equals("Title")) {
383	get_section_title = true;
384	}
385	continue; // move on to teh next metadata
386	}
387	}
388
389	// here, we look for the specific meta elem in doc_meta_list
390	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
391	if (meta_item != null) {
392	meta_item = (Element)this.doc.importNode(meta_item, true);
393	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
394	metadata_list.appendChild(meta_item);
395	}
396	} // for each metadata
397	}
398
399	// now we have processed all teh doc metadata, just have section one to go, if needed
400	if (get_section_title) {
401
402	Element doc_elem = loadDocument(doc_name);
403	if (doc_elem != null) {
404	Element section = getSection(doc_elem, node_id);
405	if (section != null) {
406	Element title_meta = extractTitleMeta(section);
407	if (title_meta != null) {
408	metadata_list.appendChild(title_meta);
409	}
410	}
411	}
412
413	}
414	return metadata_list;
415	}
416
417	protected Element extractTitleMeta(Element section) {
418	Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
419	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
420
421	String title = "dummy title";
422	Text t = this.doc.createTextNode(title);
423	meta_elem.appendChild(t);
424	return meta_elem;
425
426	}
427	// some methods for handling nodeIDs - they may be different for different colls, so subclasses can override them
428
429	// the full default nodeID looks like work.scope.tag.id
430	// the shorter versions are work, work.tag, work.scope.tag
431	protected String getWorkName(String node_id) {
432	int pos = node_id.indexOf('.');
433	if (pos == -1) {
434	return node_id;
435	}
436	return node_id.substring(0, pos);
437	}
438
439	// this assumes that the scope refers to a top level node - this may be overridden if the scope bit in the id is a shorthand of some sort
440	protected String translateScope(String scope) {
441	if (this.document_root_tag != null) {
442	return GSPath.appendLink(this.document_root_tag, scope);
443	}
444	return scope;
445	}
446
447	}
448

Note: See TracBrowser for help on using the repository browser.

Download in other formats: