Context Navigation

source: trunk/gsdl3/src/java/org/greenstone/gsdl3/service/XMLRetrieve.java@ 5945

Last change on this file since 5945 was 5264, checked in by kjdon, 21 years ago
removed unnecessary print statements
Property svn:keywords set to `Author Date Id Revision`
File size: 14.5 KB

Line
1	package org.greenstone.gsdl3.service;
2
3
4	// Greenstone classes
5	import org.greenstone.gsdl3.util.*;
6
7	// XML classes
8	import org.w3c.dom.Document;
9	import org.w3c.dom.Element;
10	import org.w3c.dom.Node;
11	import org.w3c.dom.Attr;
12	import org.w3c.dom.Text;
13	import org.w3c.dom.NodeList;
14	import org.w3c.dom.NamedNodeMap;
15
16	// General Java classes
17	import java.io.File;
18	import java.util.Vector;
19	import java.util.HashMap;
20
21
/**
 * Service rack offering up to three retrieve services (document content,
 * metadata and structure) for documents that are stored as XML files named
 * <code>&lt;work&gt;.xml</code> in the "text" directory under the
 * collection's index directory.
 */
22	public class XMLRetrieve extends ServiceRack {
23
// names under which the three services are advertised and answered
24	protected static final String CONTENT_SERVICE = "DocumentContentRetrieve";
25	protected static final String METADATA_SERVICE = "DocumentMetadataRetrieve";
26	protected static final String STRUCTURE_SERVICE = "DocumentStructureRetrieve";
27
// stylesheet file name used to build the table of contents; configure() takes
// it from the "tocXSLT" parameter (default "default_toc") and appends ".xsl"
28	protected String toc_xsl_name = "";
// encoding used when reading document files; configure() takes it from the
// "documentEncoding" parameter (default "UTF-8")
29	protected String document_encoding = "";
// document list element found in the extra_info passed to configure();
// stays null until configure() has run
30	protected Element collection_doc_list = null;
31
// whether each service is offered; configure() switches one off when a
// non-empty "provideServices" parameter does not mention it
32	protected boolean provide_content = true;
33	protected boolean provide_structure = true;
34	protected boolean provide_metadata = true;
35
36
37	public boolean configure(Element info, Element extra_info) {
38
39	// look for the parameters
40	Element param_list = (Element)GSXML.getChildByTagName(info, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
41	HashMap params;
42	String services_to_provide = "";
43	if (param_list != null) {
44	params = GSXML.extractParams(param_list, false);
45	this.toc_xsl_name = (String)params.get("tocXSLT");
46	this.document_encoding = (String)params.get("documentEncoding");
47	services_to_provide = (String)params.get("provideServices");
48	}
49	if (this.toc_xsl_name == null \|\| this.toc_xsl_name.equals("")) {
50	this.toc_xsl_name = "default_toc";
51	}
52	this.toc_xsl_name = this.toc_xsl_name+".xsl";
53
54	if (this.document_encoding == null \|\| this.document_encoding.equals("")) {
55	this.document_encoding = "UTF-8";
56	}
57
58	if (services_to_provide != null && !services_to_provide.equals("")) {
59	if (services_to_provide.indexOf("content")==-1) {
60	provide_content = false;
61	}
62	if (services_to_provide.indexOf("metadata")==-1) {
63	provide_metadata = false;
64	}
65	if (services_to_provide.indexOf("structure")==-1) {
66	provide_structure = false;
67	}
68
69	}
70
71	// set up short_service_info_ - for now just has name and type
72	Element retrieve_service;
73	if (provide_content) {
74	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
75	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
76	retrieve_service.setAttribute(GSXML.NAME_ATT, CONTENT_SERVICE);
77	this.short_service_info.appendChild(retrieve_service);
78	}
79	if (provide_metadata) {
80	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
81	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
82	retrieve_service.setAttribute(GSXML.NAME_ATT, METADATA_SERVICE);
83	this.short_service_info.appendChild(retrieve_service);
84	}
85	if (provide_structure) {
86	retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
87	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
88	retrieve_service.setAttribute(GSXML.NAME_ATT, STRUCTURE_SERVICE);
89	this.short_service_info.appendChild(retrieve_service);
90	}
91	// find the doc list from the extra_info and keep it - should this be in collect.cfg or build.cfg??
92	collection_doc_list = (Element)GSXML.getChildByTagName(extra_info, GSXML.DOCUMENT_ELEM+GSXML.LIST_MODIFIER);
93
94	return true;
95	}
96
97	// this may get called but is not useful in the case of retrieve services
98	protected Element getServiceDescription(String service_id, String lang, String subset) {
99
100	Element retrieve_service = this.doc.createElement(GSXML.SERVICE_ELEM);
101	retrieve_service.setAttribute(GSXML.TYPE_ATT, GSXML.SERVICE_TYPE_RETRIEVE);
102	retrieve_service.setAttribute(GSXML.NAME_ATT, service_id);
103	return retrieve_service;
104	}
105
106	protected Element processDocumentContentRetrieve(Element request) {
107	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
108	result.setAttribute(GSXML.FROM_ATT, CONTENT_SERVICE);
109	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
110
111	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
112	if (doc_list == null) {
113	return result;
114	}
115	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
116	result.appendChild(result_doc_list);
117
118	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
119	for (int i=0; i<docs.getLength(); i++) {
120
121	Element doc = (Element)docs.item(i);
122	Element content = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
123	doc.appendChild(content);
124
125	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
126	String doc_name = getWorkName(node_id);
127
128	Element doc_elem = loadDocument(doc_name); // should perhaps cache the read in docs??
129	if (doc_elem == null) {
130	continue;
131	}
132
133	// if we have asked for the whole doc, just append it
134	if (doc_name.equals(node_id)) {
135	content.appendChild(this.doc.importNode(doc_elem, true));
136	continue;
137	}
138
139	// else we only want a sub section
140
141	Element section = getSection(doc_elem, node_id);
142	if (section != null) {
143	content.appendChild(this.doc.importNode(section, true));
144	}
145
146	} // for each doc
147
148	return result;
149
150	}
151
152	protected Element processDocumentStructureRetrieve(Element request) {
153	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
154	result.setAttribute(GSXML.FROM_ATT, STRUCTURE_SERVICE);
155	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
156
157	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
158	if (doc_list == null) {
159	System.err.println("XMLRetrieve.DocumentStructureRetrieve: no documents specified in the request. ");
160	return result;
161	}
162
163	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
164	result.appendChild(result_doc_list);
165	// first look for the stylesheet in the collection
166	File stylesheet = new File(GSFile.collStylesheetFile(this.site_home, this.cluster_name, this.toc_xsl_name));
167	if (!stylesheet.exists()) {
168	// now try in the site
169	stylesheet = new File(GSFile.siteStylesheetFile(this.site_home, this.toc_xsl_name));
170	}
171	if (!stylesheet.exists()) {
172	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find the stylesheet file to produce the table of contents:"+stylesheet.getPath());
173	return result;
174	}
175
176	// for now, we dont have any params, and we always return the structure of the whole document
177
178	XMLTransformer transformer = new XMLTransformer();
179	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
180
181	for (int i=0; i<docs.getLength(); i++) {
182
183	Element doc = (Element)docs.item(i);
184
185	Element structure = this.doc.createElement(GSXML.NODE_STRUCTURE_ELEM);
186	doc.appendChild(structure);
187	String doc_name = doc.getAttribute(GSXML.NODE_ID_ATT);
188	// make sure we are at the top level
189	doc_name = getWorkName(doc_name);
190
191	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
192
193	if (!doc_file.exists()) {
194	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
195	} else {
196	try {
197	Node toc = transformer.transform(stylesheet, doc_file);
198	structure.appendChild(this.doc.importNode(toc, true));
199	} catch (Exception e) {
200	System.err.println("XMLRetrieve.DocumentStructureRetrieve: couldn't transform the document to get the toc");
201	}
202	}
203
204	}
205
206	return result;
207
208	}
209
210	// this just extracts a bit of text from the section to use as the Title
211	// this should be overridden for any format that has something more suitable
212	protected Element processDocumentMetadataRetrieve(Element request) {
213	Element result = this.doc.createElement(GSXML.RESPONSE_ELEM);
214	result.setAttribute(GSXML.FROM_ATT, METADATA_SERVICE);
215	result.setAttribute(GSXML.TYPE_ATT, GSXML.REQUEST_TYPE_PROCESS);
216
217	Element doc_list = (Element)GSXML.getChildByTagName(request, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
218	if (doc_list == null) {
219	System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no documents in the request");
220	return result;
221	}
222
223	Element result_doc_list = (Element)this.doc.importNode(doc_list, true);
224	result.appendChild(result_doc_list);
225
226	Element param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
227	if (param_list == null) {
228	System.err.println("XMLRetrieve.DocumentMetadataRetrieve: no metadata in the request");
229	return result;
230	}
231
232	Vector meta_name_list = new Vector();
233	boolean all_metadata = false;
234	// Process the request parameters
235	Element param = (Element) param_list.getFirstChild();
236	while (param != null) {
237	// Identify the metadata information desired
238	if (param.getAttribute(GSXML.NAME_ATT).equals("metadata")) {
239	String metadata = GSXML.getValue(param);
240	if (metadata.equals("all")) {
241	all_metadata = true;
242	break;
243	}
244	meta_name_list.add(metadata);
245	}
246	param = (Element) param.getNextSibling();
247	}
248
249	NodeList docs = result_doc_list.getElementsByTagName(GSXML.DOC_NODE_ELEM);
250	for (int i=0; i<docs.getLength(); i++) {
251	Element doc = (Element)docs.item(i);
252	String node_id = doc.getAttribute(GSXML.NODE_ID_ATT);
253	String doc_name = getWorkName(node_id);
254
255	Element metadata_list = getMetadata(node_id, all_metadata, meta_name_list);
256	doc.appendChild(metadata_list);
257	}
258
259	return result;
260	}
261
262	protected Element loadDocument(String doc_name) {
263	// try to find the document
264	File doc_file = new File(GSFile.collectionIndexDir(this.site_home, this.cluster_name)+File.separator+"text"+File.separatorChar+doc_name+".xml");
265
266	if (!doc_file.exists()) {
267	System.out.println("XMLRetrieve.loadDocument: couldn't find file in coll "+this.cluster_name +", file "+doc_name+".xml");
268	return null;
269	}
270
271	Document the_doc = null;
272	try {
273	the_doc = this.converter.getDOM(doc_file, this.document_encoding);
274	} catch (Exception e) {
275	System.err.println("XMLRetrieve.loadDocument: couldn't create a DOM from file "+doc_file.getPath());
276	return null;
277	}
278
279	return the_doc.getDocumentElement();
280
281	}
282
283
284	protected Element getSection(Element doc_elem, String node_id) {
285	String [] bits = node_id.split("\\.");
286	if (bits.length > 4) {
287	System.err.println("XMLRetrieve.getSection: badly formatted node id ("+node_id +"), cant retrieve the section");
288	return null;
289	}
290
291	String id="";
292	String tagname = "";
293	String scope = "";
294	if (bits.length==2) {
295	tagname = bits[1];
296	} else {
297	scope = bits[1];
298	tagname = bits[2];
299
300	if (bits.length == 4) {
301	id = bits[3];
302	}
303	}
304	scope = translateScope(scope);
305	Element top=null;
306	if (!scope.equals("")) {
307	top = (Element)GSXML.getNodeByPath(doc_elem, scope);
308	if (top == null) {
309	// something gone wrong
310	return null;
311	}
312	} else {
313	top = doc_elem;
314	}
315
316	NodeList elements = top.getElementsByTagName(tagname);
317	if (elements.getLength() == 0) {
318	return null;
319	}
320	// no id, just return the first one
321	if (id.equals("")) {
322	return (Element)elements.item(0);
323	}
324	// have an id, need to check and find the right one.
325	for (int i=0; i<elements.getLength();i++) {
326	Element e = (Element)elements.item(i);
327	if (e.getAttribute("gs3:id").equals(id)) {
328	return e;
329	}
330	}
331	return null;
332
333	}
334
335	protected Element getMetadata(String node_id, boolean all, Vector meta_name_list) {
336
337	// our default strategy here is to only return Title and root:Title
338	// ignore all others
339	// the title of a section is just a little bit of the text inside it.
340	// the root:Title is the title from the doc info in the config file
341	Element metadata_list = this.doc.createElement(GSXML.METADATA_ELEM+ GSXML.LIST_MODIFIER);
342	String doc_name = getWorkName(node_id);
343	boolean node_is_root = false;
344	if (doc_name.equals(node_id)) {
345	node_is_root = true;
346	}
347
348	Element this_doc = GSXML.getNamedElement(this.collection_doc_list, GSXML.DOCUMENT_ELEM, GSXML.NAME_ATT, doc_name);
349	Element doc_meta_list = (Element) GSXML.getChildByTagName(this_doc, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
350
351	boolean get_section_title = false;
352
353	if (all) {
354	if (node_is_root) {
355	return (Element)this.doc.importNode(doc_meta_list, true);
356	} else {
357	get_section_title = true;
358	}
359
360	} else {
361	// have to process metadata one by one
362	for (int i=0; i<meta_name_list.size(); i++) {
363	String meta_name = (String) meta_name_list.elementAt(i);
364	String actual_meta_name = meta_name;
365	if (meta_name.startsWith("root:")) {
366	actual_meta_name = meta_name.substring(5);
367	} else {
368	// its a section level one - check to see if doc is root
369	if (!node_is_root) {
370	if (meta_name.equals("Title")) {
371	get_section_title = true;
372	}
373	continue; // move on to teh next metadata
374	}
375	}
376
377	// here, we look for the specific meta elem in doc_meta_list
378	Element meta_item = GSXML.getNamedElement(doc_meta_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, actual_meta_name);
379	if (meta_item != null) {
380	meta_item = (Element)this.doc.importNode(meta_item, true);
381	meta_item.setAttribute(GSXML.NAME_ATT, meta_name);
382	metadata_list.appendChild(meta_item);
383	}
384	} // for each metadata
385	}
386
387	// now we have processed all teh doc metadata, just have section one to go, if needed
388	if (get_section_title) {
389
390	Element doc_elem = loadDocument(doc_name);
391	if (doc_elem != null) {
392	Element section = getSection(doc_elem, node_id);
393	if (section != null) {
394	Element title_meta = extractTitleMeta(section);
395	metadata_list.appendChild(title_meta);
396
397	}
398	}
399
400	}
401	return metadata_list;
402	}
403
404	protected Element extractTitleMeta(Element section) {
405	Element meta_elem = this.doc.createElement(GSXML.METADATA_ELEM);
406	meta_elem.setAttribute(GSXML.NAME_ATT, "Title");
407
408	String title = "dummy title";
409	Text t = this.doc.createTextNode(title);
410	meta_elem.appendChild(t);
411	return meta_elem;
412
413	}
414	// some methods for handling nodeIDs - they may be different for different collections, so subclasses can override them
415
416	// the full default nodeID looks like work.scope.tag.id
417	// the shorter versions are work, work.tag, work.scope.tag
418	protected String getWorkName(String node_id) {
419	int pos = node_id.indexOf('.');
420	if (pos == -1) {
421	return node_id;
422	}
423	return node_id.substring(0, pos);
424	}
425
426	// this assumes that the scope refers to a top level node - this may be overridden if the scope bit in the id is a shorthand of some sort
427	protected String translateScope(String scope) {
428	return scope;
429	}
430
431	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: