Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 6872

Last change on this file since 6872 was 6872, checked in by kjdon, 20 years ago
fixed a comment
Property svn:keywords set to `Author Date Id Revision`
File size: 14.8 KB

Line
1	package org.greenstone.gsdl3.service;
2
3
4	// Greenstone classes
5	import org.greenstone.gsdl3.util.*;
6
7	// XML classes
8	import org.w3c.dom.Document;
9	import org.w3c.dom.Element;
10	import org.w3c.dom.Node;
11	import org.w3c.dom.Attr;
12	import org.w3c.dom.Text;
13	import org.w3c.dom.NodeList;
14	import org.w3c.dom.NamedNodeMap;
15
16	// General Java classes
17	import java.io.File;
18	import java.util.Vector;
19	import java.util.HashMap;
20
21
22	public class XMLRetrieve extends ServiceRack {
23
24	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
25	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
26	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
27
28	protected String toc_xsl_name = "";
29	protected String document_encoding = "";
30	protected String document_root_tag = "";
31
32	protected Element collection_doc_list = null;
33
34	protected boolean provide_content = true;
35	protected boolean provide_structure = true;
36	protected boolean provide_metadata = true;
37
38
39	public boolean configure(Element info, Element extra_info) {
40
41	System.out.println("configuring XMLRetrieve...");
42	// look for the parameters
43	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
44	HashMap params;
45	String services_to_provide = "";
46	if (param_list != null) {
47	params = GSXML.extractParams(param_list, false);
48	this.toc_xsl_name = (String)params.get("tocXSLT");
49	this.document_encoding = (String)params.get("documentEncoding");
50	this.document_root_tag = (String)params.get("documentRootTag");
51	services_to_provide = (String)params.get("provideServices");
52	}
53	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
54	this.toc_xsl_name = "default_toc";
55	}
56	this.toc_xsl_name = this.toc_xsl_name+".xsl";
57
58	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
59	this.document_encoding = "UTF-8";
60	}
61
62	if (services_to_provide != null && !services_to_provide.equals("")) {
63	if (services_to_provide.indexOf("content")==-1) {
64	provide_content = false;
65	}
66	if (services_to_provide.indexOf("metadata")==-1) {
67	provide_metadata = false;
68	}
69	if (services_to_provide.indexOf("structure")==-1) {
70	provide_structure = false;
71	}
72
73	}
74
75	// set up short_service_info_ - for now just has name and type
76	Element retrieve_service;
77	if (provide_content) {
78	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
79	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
80	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
81	this.short_service_info.appendChild(retrieve_service);
82	}
83	if (provide_metadata) {
84	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
85	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
86	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
87	this.short_service_info.appendChild(retrieve_service);
88	}
89	if (provide_structure) {
90	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
91	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
92	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
93	this.short_service_info.appendChild(retrieve_service);
94	}
95	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
96	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
97
98	this.converter.setEntityResolver(new GSEntityResolver());
99	return true;
100	}
101
102	// this may get called but is not useful in the case of retrieve services
103	protected Element getServiceDescription(String service_id, String lang, String subset) {
104
105	Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
106	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
107	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
108	return retrieve_service;
109	}
110
111	protected Element processDocumentContentRetrieve(Element request) {
112	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
113	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
114	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
115
116	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
117	if (doc_list == null) {
118	return result;
119	}
120	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
121	result.appendChild(result_doc_list);
122
123	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
124	for (int i=0; i<docs.getLength(); i++) {
125
126	Element doc = (Element)docs.item(i);
127	Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
128	doc.appendChild(content);
129
130	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
131	String doc_name = getWorkName(node_id);
132
133	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
134	if (doc_elem == null) {
135	continue;
136	}
137
138
139	// if we have asked for the whole doc, just append it
140	if (doc_name.equals(node_id)) {
141	content.appendChild(this.doc.importNode(doc_elem, true));
142	continue;
143	}
144
145	// else we only want a sub section
146
147	Element section = getSection(doc_elem, node_id);
148	if (section != null) {
149	content.appendChild(this.doc.importNode(section, true));
150	}
151
152	} // for each doc
153
154	return result;
155
156	}
157
158	protected Element processDocumentStructureRetrieve(Element request) {
159	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
160	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
161	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
162
163	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
164	if (doc_list == null) {
165	System.err.println("XMLRetrieve.DocumentStructureRetrieve: no documents specified in the request. ");
166	return result;
167	}
168
169	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
170	result.appendChild(result_doc_list);
171	// first look for the stylesheet in the collection
172	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
173	if (!stylesheet.exists()) {
174	// now try in the site
175	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
176	}
177	if (!stylesheet.exists()) {
178	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
179	return result;
180	}
181
182	// for now, we dont have any params, and we always return the structure of the whole document
183
184	XMLTransformer transformer = new XMLTransformer();
185	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
186
187	for (int i=0; i<docs.getLength(); i++) {
188
189	Element doc = (Element)docs.item(i);
190
191	Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
192	doc.appendChild(structure);
193	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
194	// make sure we are at the top level
195	doc_name = getWorkName(doc_name);
196
197	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
198
199	if (!doc_file.exists()) {
200	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
201	} else {
202	try {
203	Node toc = transformer.transform(stylesheet, doc_file);
204	structure.appendChild(this.doc.importNode(toc, true));
205	} catch (Exception e) {
206	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't transform the document to get the toc");
207	}
208	}
209
210	}
211
212	return result;
213
214	}
215
216	// this just extracts a bit of text from the section to use as the Title
217	// this should be overwritten for any format that has something more suitable
218	protected Element processDocumentMetadataRetrieve(Element request) {
219	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
220	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
221	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
222
223	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
224	if (doc_list == null) {
225	System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no documents in the request");
226	return result;
227	}
228
229	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
230	result.appendChild(result_doc_list);
231
232	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
233	if (param_list == null) {
234	System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no metadata in the request");
235	return result;
236	}
237
238	Vector meta_name_list = new Vector();
239	boolean all_metadata = false;
240	// Process the request parameters
241	Element param = (Element) param_list.getFirstChild();
242	while (param != null) {
243	// Identify the metadata information desired
244	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
245	String metadata = GSXML.getValue(param);
246	if (metadata.equals("all")) {
247	all_metadata = true;
248	break;
249	}
250	meta_name_list.add(metadata);
251	}
252	param = (Element) param.getNextSibling();
253	}
254
255	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
256	for (int i=0; i<docs.getLength(); i++) {
257	Element doc = (Element)docs.item(i);
258	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
259	String doc_name = getWorkName(node_id);
260
261	Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
262	doc.appendChild(metadata_list);
263	}
264
265	return result;
266	}
267
268	protected Element loadDocument(String doc_name) {
269	// try to find the document
270	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
271
272	if (!doc_file.exists()) {
273	System.out.println("XMLRetrieve.loadDocument: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
274	return null;
275	}
276
277	Document the_doc = null;
278	try {
279	the_doc = this.converter.getDOM(doc_file, this.document_encoding);
280	} catch (Exception e) {
281	System.err.println("XMLRetrieve.loadDocument: couldn't create a DOM from file "+doc_file.getPath());
282	return null;
283	}
284
285	return the_doc.getDocumentElement();
286
287	}
288
289
290	protected Element getSection(Element doc_elem, String node_id) {
291	String [] bits = node_id.split("\\.");
292	if (bits.length > 4) {
293	System.err.println("XMLRetrieve.getSection: badly formatted node id ("+node_id +"), cant retrieve the section");
294	return null;
295	}
296
297	String id="";
298	String tagname = "";
299	String scope = "";
300	if (bits.length==2) {
301	tagname = bits[1];
302	} else {
303	scope = bits[1];
304	tagname = bits[2];
305
306	if (bits.length == 4) {
307	id = bits[3];
308	}
309	}
310	scope = translateScope(scope);
311	Element top=null;
312	if (!scope.equals("")) {
313	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
314	if (top == null) {
315	// something gone wrong
316	return null;
317	}
318	} else {
319	top = doc_elem;
320	}
321
322	NodeList elements = top.getElementsByTagName(tagname);
323	if (elements.getLength() == 0) {
324	return null;
325	}
326	// no id, just return the first one
327	if (id.equals("")) {
328	return (Element)elements.item(0);
329	}
330	// have an id, need to check and find the right one.
331	for (int i=0; i<elements.getLength();i++) {
332	Element e = (Element)elements.item(i);
333	if (e.getAttribute("gs3:id").equals(id)) {
334	return e;
335	}
336	}
337	return null;
338
339	}
340
341	protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
342
343	// our default strategy here is to only return Title and root:Title
344	// ignore all others
345	// the title of a section is just a little bit of the text inside it.
346	// the root_Title is the title from the doc info in the config file
347	Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
348	String doc_name = getWorkName(node_id);
349	boolean node_is_root = false;
350	if (doc_name.equals(node_id)) {
351	node_is_root = true;
352	}
353
354	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
355	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
356
357	boolean get_section_title = false;
358
359	if (all) {
360	if (node_is_root) {
361	return (Element)this.doc.importNode(doc_meta_list, true);
362	} else {
363	get_section_title = true;
364	}
365
366	} else {
367	// have to process metadata one by one
368	for (int i=0; i<meta_name_list.size(); i++) {
369	String meta_name = (String) meta_name_list.elementAt(i);
370	String actual_meta_name = meta_name;
371	if (meta_name.startsWith("root_")) {
372	actual_meta_name = meta_name.substring(5);
373	} else {
374	// its a section level one - check to see if doc is root
375	if (!node_is_root) {
376	if (meta_name.equals("Title")) {
377	get_section_title = true;
378	}
379	continue; // move on to teh next metadata
380	}
381	}
382
383	// here, we look for the specific meta elem in doc_meta_list
384	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
385	if (meta_item != null) {
386	meta_item = (Element)this.doc.importNode(meta_item, true);
387	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
388	metadata_list.appendChild(meta_item);
389	}
390	} // for each metadata
391	}
392
393	// now we have processed all teh doc metadata, just have section one to go, if needed
394	if (get_section_title) {
395
396	Element doc_elem = loadDocument(doc_name);
397	if (doc_elem != null) {
398	Element section = getSection(doc_elem, node_id);
399	if (section != null) {
400	Element title_meta = extractTitleMeta(section);
401	if (title_meta != null) {
402	metadata_list.appendChild(title_meta);
403	}
404	}
405	}
406
407	}
408	return metadata_list;
409	}
410
411	protected Element extractTitleMeta(Element section) {
412	Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
413	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
414
415	String title = "dummy title";
416	Text t = this.doc.createTextNode(title);
417	meta_elem.appendChild(t);
418	return meta_elem;
419
420	}
421	// some methods for handling nodeIDs - they may be different for different colls, so they can be overwritten
422
423	// the full default nodeID looks like work.scope.tag.id
424	// the shorter versions are work, work.tag, work.scope.tag
425	protected String getWorkName(String node_id) {
426	int pos = node_id.indexOf('.');
427	if (pos == -1) {
428	return node_id;
429	}
430	return node_id.substring(0, pos);
431	}
432
433	// this assumes that the scope refers to a top level node - this may be overwritten if the scope bit in the id is a shorthand of some sort
434	protected String translateScope(String scope) {
435	if (this.document_root_tag != null) {
436	return GSPath.appendLink(this.document_root_tag, scope);
437	}
438	return scope;
439	}
440
441	}
442

Note: See TracBrowser for help on using the repository browser.

Download in other formats: