Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 24813

Last change on this file since 24813 was 24813, checked in by sjm84, 12 years ago
Fixed highlighting inside tags
Property svn:keywords set to `Author Date Id Revision`
File size: 39.1 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24
25	// XML classes
26	import org.w3c.dom.Document;
27	import org.w3c.dom.Element;
28	import org.w3c.dom.Node;
29	import org.w3c.dom.Text;
30	import org.w3c.dom.NodeList;
31
32	// General Java classes
33	import java.util.ArrayList;
34	import java.util.HashMap;
35	import java.util.HashSet;
36	import java.io.File;
37
38	import org.apache.log4j.*;
39
40	/** Action class for retrieving Documents via the message router */
41	public class DocumentAction extends Action
42	{
43
44	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46	// this is used to specify that the sibling nodes of a selected one should be obtained
47	public static final String SIBLING_ARG = "sib";
48	public static final String GOTO_PAGE_ARG = "gp";
49	public static final String ENRICH_DOC_ARG = "end";
50
51	/**
52	* if this is set to true, when a document is displayed, any annotation type
53	* services (enrich) will be offered to the user as well
54	*/
55	protected boolean provide_annotations = false;
56
57	protected boolean highlight_query_terms = false;
58
59	public boolean configure()
60	{
61	super.configure();
62	String highlight = (String) config_params.get("highlightQueryTerms");
63	if (highlight != null && highlight.equals("true"))
64	{
65	highlight_query_terms = true;
66	}
67	String annotate = (String) config_params.get("displayAnnotationService");
68	if (annotate != null && annotate.equals("true"))
69	{
70	provide_annotations = true;
71	}
72	return true;
73	}
74
75	public Node process(Node message_node)
76	{
77	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
78
79	Element message = this.converter.nodeToElement(message_node);
80
81	// the response
82	Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
83	Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
84	result.appendChild(page_response);
85
86	// get the request - assume only one
87	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
88	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
89	HashMap params = GSXML.extractParams(cgi_paramList, false);
90
91	// just in case there are some that need to get passed to the services
92	HashMap service_params = (HashMap) params.get("s0");
93
94	String has_rl = null;
95	String has_href = null;
96	has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
97	has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
98	String collection = (String) params.get(GSParams.COLLECTION);
99	String lang = request.getAttribute(GSXML.LANG_ATT);
100	String uid = request.getAttribute(GSXML.USER_ID_ATT);
101	String document_name = (String) params.get(GSParams.DOCUMENT);
102	if ((document_name == null \|\| document_name.equals("")) && (has_href == null \|\| has_href.equals("")))
103	{
104	logger.error("no document specified!");
105	return result;
106	}
107	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
108	if (document_type == null)
109	{
110	document_type = "simple";
111	}
112	//whether to retrieve siblings or not
113	boolean get_siblings = false;
114	String sibs = (String) params.get(SIBLING_ARG);
115	if (sibs != null && sibs.equals("1"))
116	{
117	get_siblings = true;
118	}
119
120	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
121	if (sibling_num != null && !sibling_num.equals(""))
122	{
123	// we have to modify the doc name
124	document_name = document_name + "." + sibling_num + ".ss";
125	}
126
127	boolean expand_document = false;
128	String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
129	if (ed_arg != null && ed_arg.equals("1"))
130	{
131	expand_document = true;
132	}
133
134	boolean expand_contents = false;
135	if (expand_document)
136	{ // we always expand the contents with the text
137	expand_contents = true;
138	}
139	else
140	{
141	String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
142	if (ec_arg != null && ec_arg.equals("1"))
143	{
144	expand_contents = true;
145	}
146	}
147
148	//append site metadata
149	addSiteMetadata(page_response, lang, uid);
150
151	// get the additional data needed for the page
152	getBackgroundData(page_response, collection, lang, uid);
153	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
154
155	// the_document is where all the doc info - structure and metadata etc
156	// is added into, to be returned in the page
157	Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
158	page_response.appendChild(the_document);
159
160	// set the doctype from the cgi arg as an attribute
161	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
162
163	// create a basic doc list containing the current node
164	Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
165	Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
166	basic_doc_list.appendChild(current_doc);
167	if (document_name.length() != 0)
168	{
169	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
170	}
171	else if (has_href.length() != 0)
172	{
173	current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
174	current_doc.setAttribute("externalURL", has_rl);
175	}
176
177	// Create a parameter list to specify the required structure information
178	Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
179
180	if (service_params != null)
181	{
182	GSXML.addParametersToList(this.doc, ds_param_list, service_params);
183	}
184
185	Element ds_param = null;
186	boolean get_structure = false;
187	boolean get_structure_info = false;
188	if (document_type.equals("paged"))
189	{
190	get_structure_info = true;
191	// get teh info needed for paged naviagtion
192	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
193	ds_param_list.appendChild(ds_param);
194	ds_param.setAttribute(GSXML.NAME_ATT, "info");
195	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
196	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
197	ds_param_list.appendChild(ds_param);
198	ds_param.setAttribute(GSXML.NAME_ATT, "info");
199	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
200	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
201	ds_param_list.appendChild(ds_param);
202	ds_param.setAttribute(GSXML.NAME_ATT, "info");
203	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
204
205	}
206	else if (document_type.equals("hierarchy"))
207	{
208	get_structure = true;
209	if (expand_contents)
210	{
211	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
212	ds_param_list.appendChild(ds_param);
213	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
214	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
215	}
216	else
217	{
218	// get the info needed for table of contents
219	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
220	ds_param_list.appendChild(ds_param);
221	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
222	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
223	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
224	ds_param_list.appendChild(ds_param);
225	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
226	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
227	if (get_siblings)
228	{
229	ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
230	ds_param_list.appendChild(ds_param);
231	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
232	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
233	}
234	}
235	}
236	else
237	{
238	// we dont need any structure
239	}
240
241	boolean has_dummy = false;
242	if (get_structure \|\| get_structure_info)
243	{
244
245	// Build a request to obtain the document structure
246	Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
247	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
248	Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
249	ds_message.appendChild(ds_request);
250	ds_request.appendChild(ds_param_list);
251
252	// create a doc_node_list and put in the doc_node that we are interested in
253	ds_request.appendChild(basic_doc_list);
254
255	// Process the document structure retrieve message
256	Element ds_response_message = (Element) this.mr.process(ds_message);
257	if (processErrorElements(ds_response_message, page_response))
258	{
259	return result;
260	}
261
262	// get the info and print out
263	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
264	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
265	path = GSPath.appendLink(path, "nodeStructureInfo");
266	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
267	// get the doc_node bit
268	if (ds_response_struct_info != null)
269	{
270	the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
271	}
272	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
273	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
274	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
275	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
276
277	if (ds_response_structure != null)
278	{
279	// add the contents of the structure bit into the_document
280	NodeList structs = ds_response_structure.getChildNodes();
281	for (int i = 0; i < structs.getLength(); i++)
282	{
283	the_document.appendChild(this.doc.importNode(structs.item(i), true));
284	}
285	}
286	else
287	{
288	// no structure nodes, so put in a dummy doc node
289	Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
290	if (document_name.length() != 0)
291	{
292	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
293	}
294	else if (has_href.length() != 0)
295	{
296	doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
297	doc_node.setAttribute("externalURL", has_rl);
298	}
299	the_document.appendChild(doc_node);
300	has_dummy = true;
301	}
302	}
303	else
304	{ // a simple type - we dont have a dummy node for simple
305	// should think about this more
306	// no structure request, so just put in a dummy doc node
307	Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
308	if (document_name.length() != 0)
309	{
310	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
311	}
312	else if (has_href.length() != 0)
313	{
314	doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
315	doc_node.setAttribute("externalURL", has_rl);
316	}
317	the_document.appendChild(doc_node);
318	has_dummy = true;
319	}
320
321	// Build a request to obtain some document metadata
322	Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
323	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
324	Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
325	dm_message.appendChild(dm_request);
326	// Create a parameter list to specify the required metadata information
327
328	HashSet meta_names = new HashSet();
329	meta_names.add("Title"); // the default
330	if (format_elem != null)
331	{
332	extractMetadataNames(format_elem, meta_names);
333	}
334
335	Element dm_param_list = createMetadataParamList(meta_names);
336	if (service_params != null)
337	{
338	GSXML.addParametersToList(this.doc, dm_param_list, service_params);
339	}
340
341	dm_request.appendChild(dm_param_list);
342
343	// create the doc node list for the metadata request
344	Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
345	dm_request.appendChild(dm_doc_list);
346
347	// Add each node from the structure response into the metadata request
348	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
349	for (int i = 0; i < doc_nodes.getLength(); i++)
350	{
351	Element doc_node = (Element) doc_nodes.item(i);
352	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
353
354	// Add the documentNode to the list
355	Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
356	dm_doc_list.appendChild(dm_doc_node);
357	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
358	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
359	}
360
361	// we also want a metadata request to the top level document to get
362	// assocfilepath - this could be cached too
363	Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
364	dm_message.appendChild(doc_meta_request);
365	Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
366	if (service_params != null)
367	{
368	GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
369	}
370
371	doc_meta_request.appendChild(doc_meta_param_list);
372	Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
373	doc_meta_param_list.appendChild(doc_param);
374	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
375	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
376
377	// create the doc node list for the metadata request
378	Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
379	doc_meta_request.appendChild(doc_list);
380
381	Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
382	// the node we want is the root document node
383	if (document_name.length() != 0)
384	{
385	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name + ".rt");
386	}
387	else if (has_href.length() != 0)
388	{
389	doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href + ".rt");
390	doc_node.setAttribute("externalURL", has_rl);
391	}
392	doc_list.appendChild(doc_node);
393	Element dm_response_message = (Element) this.mr.process(dm_message);
394	if (processErrorElements(dm_response_message, page_response))
395	{
396	return result;
397	}
398
399	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
400	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
401
402	// Merge the metadata with the structure information
403	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
404	for (int i = 0; i < doc_nodes.getLength(); i++)
405	{
406	GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
407	}
408	// get the top level doc metadata out
409	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
410	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
411	GSXML.mergeMetadataLists(the_document, top_doc_node);
412
413	// Build a request to obtain some document content
414	Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
415	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
416	Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
417	dc_message.appendChild(dc_request);
418
419	// Create a parameter list to specify the request parameters - empty for now
420	Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
421	if (service_params != null)
422	{
423	GSXML.addParametersToList(this.doc, dc_param_list, service_params);
424	}
425
426	dc_request.appendChild(dc_param_list);
427
428	// get the content
429	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
430	if (expand_document)
431	{
432	dc_request.appendChild(dm_doc_list);
433	}
434	else
435	{
436	dc_request.appendChild(basic_doc_list);
437	}
438	logger.debug("request = " + converter.getString(dc_message));
439	Element dc_response_message = (Element) this.mr.process(dc_message);
440	if (processErrorElements(dc_response_message, page_response))
441	{
442	return result;
443	}
444
445	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
446
447	if (expand_document)
448	{
449	// Merge the content with the structure information
450	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
451	for (int i = 0; i < doc_nodes.getLength(); i++)
452	{
453	Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
454	if (content != null)
455	{
456	if (highlight_query_terms)
457	{
458	content = highlightQueryTerms(request, (Element) content);
459	}
460	doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
461	}
462	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
463	}
464	}
465	else
466	{
467	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
468	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
469	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
470	Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
471
472	if (dc_response_doc_content == null)
473	{
474	// no content to add
475	if (dc_response_doc_external != null)
476	{
477	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
478
479	the_document.setAttribute("selectedNode", modified_doc_id);
480	the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
481	}
482	return result;
483	}
484	if (highlight_query_terms)
485	{
486	dc_response_doc.removeChild(dc_response_doc_content);
487
488	dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
489	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
490	}
491
492	if (provide_annotations)
493	{
494	String service_selected = (String) params.get(ENRICH_DOC_ARG);
495	if (service_selected != null && service_selected.equals("1"))
496	{
497	// now we can modifiy the response doc if needed
498	String enrich_service = (String) params.get(GSParams.SERVICE);
499	// send a message to the service
500	Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
501	Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
502	enrich_message.appendChild(enrich_request);
503	// check for parameters
504	HashMap e_service_params = (HashMap) params.get("s1");
505	if (e_service_params != null)
506	{
507	Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
508	GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
509	enrich_request.appendChild(enrich_pl);
510	}
511	Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
512	enrich_request.appendChild(e_doc_list);
513	e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
514
515	Node enrich_response = this.mr.process(enrich_message);
516
517	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
518	path = GSPath.createPath(links);
519	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
520
521	}
522	} // if provide_annotations
523
524	// use the returned id rather than the sent one cos there may have
525	// been modifiers such as .pr that are removed.
526	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
527	the_document.setAttribute("selectedNode", modified_doc_id);
528	if (has_dummy)
529	{
530	// change the id if necessary and add the content
531	Element dummy_node = (Element) doc_nodes.item(0);
532
533	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
534	dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
535	// hack for simple type
536	if (document_type.equals("simple"))
537	{
538	// we dont want the internal docNode, just want the content and metadata in the document
539	// rethink this!!
540	the_document.removeChild(dummy_node);
541
542	NodeList dummy_children = dummy_node.getChildNodes();
543	//for (int i=0; i<dummy_children.getLength(); i++) {
544	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
545	{
546	// special case as we don't want more than one metadata list
547	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
548	{
549	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
550	}
551	else
552	{
553	the_document.appendChild(dummy_children.item(i));
554	}
555	}
556	}
557	}
558	else
559	{
560	// Merge the document content with the metadata and structure information
561	for (int i = 0; i < doc_nodes.getLength(); i++)
562	{
563	Node dn = doc_nodes.item(i);
564	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
565	if (dn_id.equals(modified_doc_id))
566	{
567	dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
568	break;
569	}
570	}
571	}
572	}
573	logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
574	return result;
575	}
576
577	/**
578	* tell the param class what its arguments are if an action has its own
579	* arguments, this should add them to the params object - particularly
580	* important for args that should not be saved
581	*/
582	public boolean getActionParameters(GSParams params)
583	{
584	params.addParameter(GOTO_PAGE_ARG, false);
585	params.addParameter(ENRICH_DOC_ARG, false);
586	return true;
587	}
588
589	/**
590	* this method gets the collection description, the format info, the list of
591	* enrich services, etc - stuff that is needed for the page, but is the same
592	* whatever the query is - should be cached
593	*/
594	protected boolean getBackgroundData(Element page_response, String collection, String lang, String uid)
595	{
596
597	// create a message to process - contains requests for the collection
598	// description, the format element, the enrich services on offer
599	// these could all be cached
600	Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
601	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
602	// the format request - ignore for now, where does this request go to??
603	Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
604	info_message.appendChild(format_request);
605
606	// the enrich_services request - only do this if provide_annotations is true
607
608	if (provide_annotations)
609	{
610	Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
611	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
612	info_message.appendChild(enrich_services_request);
613	}
614
615	Element info_response = (Element) this.mr.process(info_message);
616
617	// the collection is the first response
618	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
619	Element format_resp = (Element) responses.item(0);
620
621	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
622	if (format_elem != null)
623	{
624	logger.debug("doc action found a format statement");
625	// set teh format type
626	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
627	page_response.appendChild(this.doc.importNode(format_elem, true));
628	}
629
630	if (provide_annotations)
631	{
632	Element services_resp = (Element) responses.item(1);
633
634	// a new message for the mr
635	Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
636
637	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
638	boolean service_found = false;
639	for (int j = 0; j < e_services.getLength(); j++)
640	{
641	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
642	{
643	Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
644	enrich_message.appendChild(s);
645	service_found = true;
646	}
647	}
648	if (service_found)
649	{
650	Element enrich_response = (Element) this.mr.process(enrich_message);
651
652	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
653	Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
654	for (int i = 0; i < e_responses.getLength(); i++)
655	{
656	Element e_resp = (Element) e_responses.item(i);
657	Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
658	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
659	service_list.appendChild(e_service);
660	}
661	page_response.appendChild(service_list);
662	}
663	} // if provide_annotations
664	return true;
665
666	}
667
668	/**
669	* this involves a bit of a hack to get the equivalent query terms - has to
670	* requery the query service - uses the last selected service name. (if it
671	* ends in query). should this action do the query or should it send a
672	* message to the query action? but that will involve lots of extra stuff.
673	* also doesn't handle phrases properly - just highlights all the terms found
674	* in the text.
675	*/
676	protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
677	{
678
679	// do the query again to get term info
680	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
681	HashMap params = GSXML.extractParams(cgi_param_list, false);
682
683	HashMap previous_params = (HashMap) params.get("p");
684	if (previous_params == null)
685	{
686	return dc_response_doc_content;
687	}
688	String service_name = (String) previous_params.get(GSParams.SERVICE);
689	if (service_name == null \|\| !service_name.endsWith("Query"))
690	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
691	logger.debug("invalid service, not doing highlighting");
692	return dc_response_doc_content;
693	}
694	String collection = (String) params.get(GSParams.COLLECTION);
695	String lang = request.getAttribute(GSXML.LANG_ATT);
696	String uid = request.getAttribute(GSXML.USER_ID_ATT);
697	String to = GSPath.appendLink(collection, service_name);
698
699	Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
700	Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
701	mr_query_message.appendChild(mr_query_request);
702
703	// paramList
704	HashMap service_params = (HashMap) params.get("s1");
705
706	Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
707	GSXML.addParametersToList(this.doc, query_param_list, service_params);
708	mr_query_request.appendChild(query_param_list);
709
710	// do the query
711	Element mr_query_response = (Element) this.mr.process(mr_query_message);
712
713	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
714	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
715	if (query_term_list_element == null)
716	{
717	// no term info
718	logger.error("No query term information.\n");
719	return dc_response_doc_content;
720	}
721
722	String content = GSXML.getNodeText(dc_response_doc_content);
723
724	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
725	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
726
727	HashSet query_term_variants = new HashSet();
728	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
729	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
730	{
731	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
732	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
733	{
734	for (int i = 0; i < terms_nodelist.getLength(); i++)
735	{
736	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
737	String termValueU = null;
738	String termValueL = null;
739
740	if (termValue.length() > 1)
741	{
742	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
743	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
744	}
745	else
746	{
747	termValueU = termValue.substring(0, 1).toUpperCase();
748	termValueL = termValue.substring(0, 1).toLowerCase();
749	}
750
751	query_term_variants.add(termValueU);
752	query_term_variants.add(termValueL);
753	}
754	}
755	}
756	else
757	{
758	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
759	{
760	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
761	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
762	for (int j = 0; j < equivalent_terms.length; j++)
763	{
764	query_term_variants.add(equivalent_terms[j]);
765	}
766	}
767	}
768
769	ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
770
771	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
772	String performed_query = GSXML.getNodeText(query_element) + " ";
773
774	ArrayList phrase_query_p_term_variants_list = new ArrayList();
775	int term_start = 0;
776	boolean in_term = false;
777	boolean in_phrase = false;
778	for (int i = 0; i < performed_query.length(); i++)
779	{
780	char character = performed_query.charAt(i);
781	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
782
783	// Has a query term just started?
784	if (in_term == false && is_character_letter_or_digit == true)
785	{
786	in_term = true;
787	term_start = i;
788	}
789
790	// Or has a term just finished?
791	else if (in_term == true && is_character_letter_or_digit == false)
792	{
793	in_term = false;
794	String term = performed_query.substring(term_start, i);
795
796	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
797	if (term_element != null)
798	{
799
800	HashSet phrase_query_p_term_x_variants = new HashSet();
801
802	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
803	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
804	{
805	String termValueU = null;
806	String termValueL = null;
807
808	if (term.length() > 1)
809	{
810	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
811	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
812	}
813	else
814	{
815	termValueU = term.substring(0, 1).toUpperCase();
816	termValueL = term.substring(0, 1).toLowerCase();
817	}
818
819	phrase_query_p_term_x_variants.add(termValueU);
820	phrase_query_p_term_x_variants.add(termValueL);
821	}
822	else
823	{
824	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
825	{
826	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
827	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
828	for (int k = 0; k < term_equivalent_terms.length; k++)
829	{
830	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
831	}
832	}
833	}
834	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
835
836	if (in_phrase == false)
837	{
838	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
839	phrase_query_p_term_variants_list = new ArrayList();
840	}
841	}
842	}
843	// Watch for phrases (surrounded by quotes)
844	if (character == '\"')
845	{
846	// Has a phrase just started?
847	if (in_phrase == false)
848	{
849	in_phrase = true;
850	}
851	// Or has a phrase just finished?
852	else if (in_phrase == true)
853	{
854	in_phrase = false;
855	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
856	}
857
858	phrase_query_p_term_variants_list = new ArrayList();
859	}
860	}
861
862	System.err.println(query_term_variants + " *** " + phrase_query_term_variants_hierarchy);
863	return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
864	}
865
866	/**
867	* Highlights query terms in a piece of text.
868	*/
869	private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
870	{
871	// Convert the content string to an array of characters for speed
872	char[] content_characters = new char[content.length()];
873	content.getChars(0, content.length(), content_characters, 0);
874
875	// Now skim through the content, identifying word matches
876	ArrayList word_matches = new ArrayList();
877	int word_start = 0;
878	boolean in_word = false;
879	boolean preceding_word_matched = false;
880	boolean inTag = false;
881	for (int i = 0; i < content_characters.length; i++)
882	{
883	//We don't want to find words inside HTML tags
884	if(content_characters[i] == '<')
885	{
886	inTag = true;
887	continue;
888	}
889	else if (inTag && content_characters[i] == '>')
890	{
891	inTag = false;
892	}
893	else if (inTag)
894	{
895	continue;
896	}
897
898	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
899
900	// Has a word just started?
901	if (in_word == false && is_character_letter_or_digit == true)
902	{
903	in_word = true;
904	word_start = i;
905	}
906
907	// Or has a word just finished?
908	else if (in_word == true && is_character_letter_or_digit == false)
909	{
910	in_word = false;
911
912	// Check if the word matches any of the query term equivalents
913	String word = new String(content_characters, word_start, (i - word_start));
914	if (query_term_variants.contains(word))
915	{
916	// We have found a matching word, so remember its location
917	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
918	preceding_word_matched = true;
919	}
920	else
921	{
922	preceding_word_matched = false;
923	}
924	}
925	}
926
927	// Don't forget the last word...
928	if (in_word == true)
929	{
930	// Check if the word matches any of the query term equivalents
931	String word = new String(content_characters, word_start, (content_characters.length - word_start));
932	if (query_term_variants.contains(word))
933	{
934	// We have found a matching word, so remember its location
935	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
936	}
937	}
938
939	ArrayList highlight_start_positions = new ArrayList();
940	ArrayList highlight_end_positions = new ArrayList();
941
942	// Deal with phrases now
943	ArrayList partial_phrase_matches = new ArrayList();
944	for (int i = 0; i < word_matches.size(); i++)
945	{
946	WordMatch word_match = (WordMatch) word_matches.get(i);
947
948	// See if any partial phrase matches are extended by this word
949	if (word_match.preceding_word_matched)
950	{
951	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
952	{
953	PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
954	ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
955	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
956	if (phrase_query_p_term_x_variants.contains(word_match.word))
957	{
958	partial_phrase_match.num_words_matched++;
959
960	// Has a complete phrase match occurred?
961	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
962	{
963	// Check for overlaps by looking at the previous highlight range
964	if (!highlight_end_positions.isEmpty())
965	{
966	int last_highlight_index = highlight_end_positions.size() - 1;
967	int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
968	if (last_highlight_end > partial_phrase_match.start_position)
969	{
970	// There is an overlap, so remove the previous phrase match
971	int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
972	highlight_end_positions.remove(last_highlight_index);
973	partial_phrase_match.start_position = last_highlight_start;
974	}
975	}
976
977	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
978	highlight_end_positions.add(new Integer(word_match.end_position));
979	}
980	// No, but add the partial match back into the list for next time
981	else
982	{
983	partial_phrase_matches.add(partial_phrase_match);
984	}
985	}
986	}
987	}
988	else
989	{
990	partial_phrase_matches.clear();
991	}
992
993	// See if this word is at the start of any of the phrases
994	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
995	{
996	ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
997	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
998	if (phrase_query_p_term_1_variants.contains(word_match.word))
999	{
1000	// If this phrase is just one word long, we have a complete match
1001	if (phrase_query_p_term_variants_list.size() == 1)
1002	{
1003	highlight_start_positions.add(new Integer(word_match.start_position));
1004	highlight_end_positions.add(new Integer(word_match.end_position));
1005	}
1006	// Otherwise we have the start of a potential phrase match
1007	else
1008	{
1009	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1010	}
1011	}
1012	}
1013	}
1014
1015	// Now add the annotation tags into the document at the correct points
1016	Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1017
1018	int last_wrote = 0;
1019	for (int i = 0; i < highlight_start_positions.size(); i++)
1020	{
1021	int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
1022	int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
1023
1024	// Print anything before the highlight range
1025	if (last_wrote < highlight_start)
1026	{
1027	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1028	content_element.appendChild(this.doc.createTextNode(preceding_text));
1029	}
1030
1031	// Print the highlight text, annotated
1032	if (highlight_end > last_wrote)
1033	{
1034	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1035	Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1036	annotation_element.setAttribute("type", "query_term");
1037	content_element.appendChild(annotation_element);
1038	last_wrote = highlight_end;
1039	}
1040	}
1041
1042	// Finish off any unwritten text
1043	if (last_wrote < content_characters.length)
1044	{
1045	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1046	content_element.appendChild(this.doc.createTextNode(remaining_text));
1047	}
1048
1049	return content_element;
1050	}
1051
1052	static private class WordMatch
1053	{
1054	public String word;
1055	public int start_position;
1056	public int end_position;
1057	public boolean preceding_word_matched;
1058
1059	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1060	{
1061	this.word = word;
1062	this.start_position = start_position;
1063	this.end_position = end_position;
1064	this.preceding_word_matched = preceding_word_matched;
1065	}
1066	}
1067
1068	static private class PartialPhraseMatch
1069	{
1070	public int start_position;
1071	public int query_phrase_number;
1072	public int num_words_matched;
1073
1074	public PartialPhraseMatch(int start_position, int query_phrase_number)
1075	{
1076	this.start_position = start_position;
1077	this.query_phrase_number = query_phrase_number;
1078	this.num_words_matched = 1;
1079	}
1080	}
1081	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: