Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 37177

Last change on this file since 37177 was 37177, checked in by davidb, 15 months ago
Introduction of a new optional parameter, docVersion. If it is null (or equal to the empty string), the code works as before. It is designed to work with the file-level document-version history mechanism: if non-empty, its value is used to change where doc.xml is read from on the file system.
Property svn:keywords set to `Author Date Id Revision`
File size: 64.0 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24	import org.greenstone.gsdl3.service.DocXMLUtil;
25	import org.greenstone.gsdl3.util.*;
26	import org.greenstone.util.GlobalProperties;
27
28	// XML classes
29	import org.w3c.dom.Document;
30	import org.w3c.dom.Element;
31	import org.w3c.dom.Node;
32	import org.w3c.dom.Text;
33	import org.w3c.dom.NodeList;
34
35	// General Java classes
36	import java.util.ArrayList;
37	import java.util.HashMap;
38	import java.util.HashSet;
39	import java.util.Iterator;
40	import java.io.File;
41	import java.io.Serializable;
42
43	import org.apache.log4j.*;
44
45	/** Action class for retrieving Documents via the message router */
46	public class DocumentAction extends Action
47	{
48
49	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
50
51	// this is used to specify that the sibling nodes of a selected one should be obtained
52	public static final String SIBLING_ARG = "sib";
53	public static final String GOTO_PAGE_ARG = "gp";
54	public static final String ENRICH_DOC_ARG = "end";
55	public static final String EXPAND_DOCUMENT_ARG = "ed";
56	public static final String EXPAND_CONTENTS_ARG = "ec";
57	public static final String REALISTIC_BOOK_ARG = "book";
58	public static final String NO_TEXT_ARG = "noText";
59	public static final String DOC_EDIT_ARG = "docEdit";
60	public static final String DOC_VERSION_ARG = "dv";
61
62	/**
63	* if this is set to true, when a document is displayed, any annotation type
64	* services (enrich) will be offered to the user as well
65	*/
66	protected boolean provide_annotations = false;
67
68	protected boolean highlight_query_terms = false;
69
70	public boolean configure()
71	{
72	super.configure();
73	String highlight = (String) config_params.get("highlightQueryTerms");
74	if (highlight != null && highlight.equals("true"))
75	{
76	highlight_query_terms = true;
77	}
78	String annotate = (String) config_params.get("displayAnnotationService");
79	if (annotate != null && annotate.equals("true"))
80	{
81	provide_annotations = true;
82	}
83	return true;
84	}
85
86	public Node process(Node message_node)
87	{
88	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
89
90	Element message = GSXML.nodeToElement(message_node);
91	Document doc = XMLConverter.newDOM();
92
93	// the response
94	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
95	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
96	result.appendChild(page_response);
97
98	// get the request - assume only one
99	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
100	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
101	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
102
103	// just in case there are some that need to get passed to the services
104	// why do we use s0 here and s1 in other places???
105	HashMap service_params = (HashMap) params.get("s0");
106
107	String collection = (String) params.get(GSParams.COLLECTION);
108	String document_id = (String) params.get(GSParams.DOCUMENT);
109	if (document_id != null && document_id.equals(""))
110	{
111	document_id = null;
112	}
113	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
114	if (href != null && href.equals(""))
115	{
116	href = null;
117	}
118	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
119	if (document_id == null && href == null)
120	{
121	logger.error("no document specified!");
122	return result;
123	}
124	if (rl != null && rl.equals("0"))
125	{
126	// this is a true external link, we should have been directed to a different page or action
127	logger.error("rl value was 0, shouldn't get here");
128	return result;
129	}
130
131	String doc_id_modifier = "";
132	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
133	if (sibling_num != null && !sibling_num.equals(""))
134	{
135	// we have to modify the doc name
136	doc_id_modifier = "." + sibling_num + ".ss";
137	}
138
139
140	UserContext userContext = new UserContext(request);
141
142	//append site metadata
143	addSiteMetadata(page_response, userContext);
144	addInterfaceOptions(page_response);
145
146	// get the additional data needed for the page
147	getBackgroundData(page_response, collection, userContext);
148
149	// create a basic doc list containing the current node
150	// we will use this to query whether the id is valid, and to get document type
151	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
152	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
153	basic_doc_list.appendChild(current_doc);
154	if (document_id != null)
155	{
156	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
157	}
158	else
159	{
160	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
161	// do we need this??
162	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
163	}
164
165	// lets do a quick check here for valid doc id.
166	if (document_id != null) {
167	boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
168	if (!is_valid) {
169	GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
170	return result;
171	}
172	}
173	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
174
175	if (format_elem != null) {
176	// lets look for param defaults set in config file
177	NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
178	for (int i=0; i<param_defaults.getLength(); i++) {
179	Element p = (Element)param_defaults.item(i);
180	String name = p.getAttribute(GSXML.NAME_ATT);
181	if (params.get(name) ==null) {
182	// wasn't set from interface
183	String value = p.getAttribute(GSXML.VALUE_ATT);
184	params.put(name, value );
185	// also add into request param xml so that xslt knows it too
186	GSXML.addParameterToList(cgi_paramList, name, value);
187	}
188	}
189	}
190
191	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
192	if (document_type != null && document_type.equals(""))
193	{
194	//document_type = "hierarchy";
195	document_type = null; // we'll get it later if not already specified
196	}
197	// what if it is null here?? Anu to check...
198
199
200	boolean editing_document = false;
201	String doc_edit = (String) params.get(DOC_EDIT_ARG);
202	if (doc_edit != null && doc_edit.equals("1")) {
203	editing_document = true;
204	}
205
206	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
207	if (editing_document) {
208	String opt_document_version = (String) params.get(DOC_VERSION_ARG);
209	return getFormattedArchiveDoc(doc, collection, document_id, opt_document_version, document_type, result, page_response, userContext);
210	}
211
212	//whether to retrieve siblings or not
213	boolean get_siblings = false;
214	String sibs = (String) params.get(SIBLING_ARG);
215	if (sibs != null && sibs.equals("1"))
216	{
217	get_siblings = true;
218	}
219
220	boolean expand_document = false;
221	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
222	if (ed_arg != null && ed_arg.equals("1"))
223	{
224	expand_document = true;
225	}
226
227	boolean expand_contents = false;
228	if (expand_document)
229	{ // we always expand the contents with the text
230	expand_contents = true;
231	}
232	else
233	{
234	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
235	if (ec_arg != null && ec_arg.equals("1"))
236	{
237	expand_contents = true;
238	}
239	}
240
241	// do we want text content? Not if no_text=1.
242	// expand_document overrides this. - should it??
243	boolean get_text = true;
244	String nt_arg = (String) params.get(NO_TEXT_ARG);
245
246	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
247	logger.debug("SETTING GET TEXT TO FALSE");
248	get_text = false;
249	} else {
250	logger.debug("GET TEXT REMAINS TRUE");
251	}
252
253	// the_document is where all the doc info - structure and metadata etc
254	// is added into, to be returned in the page
255	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
256	page_response.appendChild(the_document);
257
258	// used to create basic_doc_list here
259	if (document_type == null)
260	{
261	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
262	}
263	if (document_type == null)
264	{
265	logger.debug("##### doctype is null, setting to simple");
266	document_type = GSXML.DOC_TYPE_SIMPLE;
267	}
268
269	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
270
271	// start getting doc structure
272
273	// Create a parameter list to specify the required structure information
274	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
275
276	if (service_params != null)
277	{
278	GSXML.addParametersToList(ds_param_list, service_params);
279	}
280
281	Element ds_param = null;
282	boolean get_structure = false;
283	boolean get_structure_info = false;
284	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
285	{
286	get_structure_info = true;
287
288	if (expand_contents)
289	{
290	ds_param = doc.createElement(GSXML.PARAM_ELEM);
291	ds_param_list.appendChild(ds_param);
292	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
293	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
294	}
295
296	// get the info needed for paged naviagtion
297	ds_param = doc.createElement(GSXML.PARAM_ELEM);
298	ds_param_list.appendChild(ds_param);
299	ds_param.setAttribute(GSXML.NAME_ATT, "info");
300	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
301	ds_param = doc.createElement(GSXML.PARAM_ELEM);
302	ds_param_list.appendChild(ds_param);
303	ds_param.setAttribute(GSXML.NAME_ATT, "info");
304	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
305	ds_param = doc.createElement(GSXML.PARAM_ELEM);
306	ds_param_list.appendChild(ds_param);
307	ds_param.setAttribute(GSXML.NAME_ATT, "info");
308	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
309
310	if (get_siblings)
311	{
312	ds_param = doc.createElement(GSXML.PARAM_ELEM);
313	ds_param_list.appendChild(ds_param);
314	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
315	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
316	}
317
318	}
319	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
320	{
321	get_structure = true;
322	if (expand_contents)
323	{
324	ds_param = doc.createElement(GSXML.PARAM_ELEM);
325	ds_param_list.appendChild(ds_param);
326	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
328	}
329	else
330	{
331	// get the info needed for table of contents
332	ds_param = doc.createElement(GSXML.PARAM_ELEM);
333	ds_param_list.appendChild(ds_param);
334	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
335	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
336	ds_param = doc.createElement(GSXML.PARAM_ELEM);
337	ds_param_list.appendChild(ds_param);
338	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
340	if (get_siblings)
341	{
342	ds_param = doc.createElement(GSXML.PARAM_ELEM);
343	ds_param_list.appendChild(ds_param);
344	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
345	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
346	}
347	}
348	}
349	else
350	{
351	// we dont need any structure
352	}
353
354	boolean has_dummy = false;
355	if (get_structure \|\| get_structure_info)
356	{
357
358	// Build a request to obtain the document structure
359	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
360	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
361	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
362	ds_message.appendChild(ds_request);
363	ds_request.appendChild(ds_param_list);
364
365	// add the node list we created earlier
366	ds_request.appendChild(basic_doc_list);
367
368	// Process the document structure retrieve message
369	Element ds_response_message = (Element) this.mr.process(ds_message);
370	if (processErrorElements(ds_response_message, page_response))
371	{
372	return result;
373	}
374
375	// get the info and print out
376	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
377	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
378	path = GSPath.appendLink(path, "nodeStructureInfo");
379	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
380	// get the doc_node bit
381	if (ds_response_struct_info != null)
382	{
383	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
384	}
385	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
386	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
387	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
388	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
389
390	if (ds_response_structure != null)
391	{
392	// add the contents of the structure bit into the_document
393	NodeList structs = ds_response_structure.getChildNodes();
394	for (int i = 0; i < structs.getLength(); i++)
395	{
396	the_document.appendChild(doc.importNode(structs.item(i), true));
397	}
398	}
399	else
400	{
401	// no structure nodes, so put in a dummy doc node
402	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
403	if (document_id != null)
404	{
405	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
406	}
407	else
408	{
409	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
410
411	}
412	the_document.appendChild(doc_node);
413	has_dummy = true;
414	}
415	}
416	else
417	{ // a simple type - we dont have a dummy node for simple
418	// should think about this more
419	// no structure request, so just put in a dummy doc node
420	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
421	if (document_id != null)
422	{
423	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
424	}
425	else
426	{
427	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
428	}
429	the_document.appendChild(doc_node);
430	has_dummy = true;
431	}
432
433	// end getting doc structure
434
435	// start getting doc metadata
436
437	// Build a request to obtain some document metadata
438	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
439	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
440	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
441	dm_message.appendChild(dm_request);
442	// Create a parameter list to specify the required metadata information
443
444	HashSet<String> meta_names = new HashSet<String>();
445	meta_names.add("Title"); // the default
446	if (format_elem != null)
447	{
448	getRequiredMetadataNames(format_elem, meta_names);
449	}
450
451	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
452	if (extraMetaListElem != null)
453	{
454	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
455	for (int i = 0; i < extraMetaList.getLength(); i++)
456	{
457	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
458	}
459	}
460
461	Element dm_param_list = createMetadataParamList(doc,meta_names);
462	if (service_params != null)
463	{
464	GSXML.addParametersToList(dm_param_list, service_params);
465	}
466
467	dm_request.appendChild(dm_param_list);
468
469	// create the doc node list for the metadata request
470	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
471	dm_request.appendChild(dm_doc_list);
472
473	// Add each node from the structure response into the metadata request
474	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
475	for (int i = 0; i < doc_nodes.getLength(); i++)
476	{
477	Element doc_node = (Element) doc_nodes.item(i);
478	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
479
480	// Add the documentNode to the list
481	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
482	if (needSectionContent(params)) {
483	if (doc_node_id.equals(document_id)) {
484	dm_doc_list.appendChild(dm_doc_node);
485	}
486	} else {
487	dm_doc_list.appendChild(dm_doc_node);
488	}
489	//dm_doc_list.appendChild(dm_doc_node);
490	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
491	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
492	if (document_id == null){
493	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
494	}
495
496	}
497	// we also want a metadata request to the top level document to get
498	// assocfilepath - this could be cached too
499	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
500	dm_message.appendChild(doc_meta_request);
501	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
502	if (service_params != null)
503	{
504	GSXML.addParametersToList(doc_meta_param_list, service_params);
505	}
506
507	doc_meta_request.appendChild(doc_meta_param_list);
508	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
509	doc_meta_param_list.appendChild(doc_param);
510	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
511	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
512
513	// create the doc node list for the metadata request
514	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
515	doc_meta_request.appendChild(doc_list);
516
517	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
518	// the node we want is the root document node
519	if (document_id != null)
520	{
521	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
522	}
523	/*else
524	{
525	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
526	// can we assume that href is always a top level doc??
527	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
528	//doc_node.setAttribute("externalURL", has_rl);
529	}*/
530	doc_list.appendChild(doc_node);
531
532	Element dm_response_message = (Element) this.mr.process(dm_message);
533	if (processErrorElements(dm_response_message, page_response))
534	{
535	return result;
536	}
537
538	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
539	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
540
541	// Merge the metadata with the structure information
542	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
543	for (int i = 0; i < doc_nodes.getLength(); i++)
544	{
545	Node dcNode;
546	String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
547	if (node_idd.isEmpty()) {
548	String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
549	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
550	} else {
551	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
552	}
553	GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
554	}
555	// get the top level doc metadata out
556	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
557	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
558	GSXML.mergeMetadataLists(the_document, top_doc_node);
559
560	// if we are highlighting query terms, then we also get them highlighted in the metadata
561
562	HashSet<String> query_term_variants = null;
563	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
564	boolean do_highlight_query_terms = highlight_query_terms;
565	int query_terms_status = 0;
566	if (highlight_query_terms) {
567	// lets get the query term equivalents
568	query_term_variants = new HashSet<String>();
569	phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
570	if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
571	do_highlight_query_terms = false; // we couldn't get the terms
572	}
573	}
574
575	// lets try marking up the metadata with search terms
576	// if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
577	boolean highlight_case_insensitive = false;
578	if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
579	highlight_case_insensitive = true;
580	}
581	if (do_highlight_query_terms) {
582	highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
583	}
584
585	// do we want doc text content? If not, we are done.
586	if (!get_text) {
587	// don't get text
588	return result;
589	}
590
591	// Build a request to obtain some document content
592	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
593	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
594	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
595	dc_message.appendChild(dc_request);
596
597	// Create a parameter list to specify the request parameters - empty for now
598	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
599	if (service_params != null)
600	{
601	GSXML.addParametersToList(dc_param_list, service_params);
602	}
603
604	dc_request.appendChild(dc_param_list);
605
606	// get the content
607	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
608	if (expand_document)
609	{
610	dc_request.appendChild(dm_doc_list);
611	}
612	else
613	{
614	dc_request.appendChild(basic_doc_list);
615	}
616	Element dc_response_message = (Element) this.mr.process(dc_message);
617
618	if (processErrorElements(dc_response_message, page_response))
619	{
620	return result;
621
622	}
623	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
624
625	boolean get_marked_up_doc_from_query = false;
626	if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
627	get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
628	}
629
630	if (expand_document)
631	{
632	// Merge the content with the structure information
633	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
634	for (int i = 0; i < doc_nodes.getLength(); i++)
635	{
636	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
637	Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
638	Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
639	if (content != null)
640	{
641	if (do_highlight_query_terms) {
642	if (get_marked_up_doc_from_query) {
643
644	Element new_content = retrieveHighlightedContent(request, node_id);
645
646	if (new_content == null) {
647	// we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
648	get_marked_up_doc_from_query = false;
649	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
650	} else {
651	content= new_content;
652	}
653	} else {
654	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
655	}
656	}
657	doc_nodes.item(i).appendChild(doc.importNode(content, true));
658	}
659
660	}
661	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
662	Element dummy_node = (Element) doc_nodes.item(0);
663	the_document.removeChild(dummy_node);
664	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
665	NodeList dummy_children = dummy_node.getChildNodes();
666	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
667	{
668	// special case as we don't want more than one metadata list
669	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
670	{
671	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
672	}
673	else
674	{
675	the_document.appendChild(dummy_children.item(i));
676	}
677	}
678	}
679	}
680	else
681	{
682	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
683	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
684
685	if (dc_response_doc_content == null)
686	{
687	// no content to add
688	if (dc_response_doc.getAttribute("external").equals("true"))
689	{
690	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
691
692	the_document.setAttribute("selectedNode", href_id);
693	the_document.setAttribute("external", href_id);
694	}
695	return result;
696	}
697	if (do_highlight_query_terms)
698	{
699	dc_response_doc.removeChild(dc_response_doc_content);
700	if (get_marked_up_doc_from_query) {
701	Element new_content = retrieveHighlightedContent(request, null);
702	if (new_content == null) {
703	get_marked_up_doc_from_query = false;
704	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
705	} else {
706
707	dc_response_doc_content = new_content;
708	}
709	} else {
710	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
711	}
712	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
713	}
714
715	if (provide_annotations)
716	{
717	String service_selected = (String) params.get(ENRICH_DOC_ARG);
718	if (service_selected != null && service_selected.equals("1"))
719	{
720	// now we can modifiy the response doc if needed
721	String enrich_service = (String) params.get(GSParams.SERVICE);
722	// send a message to the service
723	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
724	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
725	enrich_message.appendChild(enrich_request);
726	// check for parameters
727	HashMap e_service_params = (HashMap) params.get("s1");
728	if (e_service_params != null)
729	{
730	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
731	GSXML.addParametersToList(enrich_pl, e_service_params);
732	enrich_request.appendChild(enrich_pl);
733	}
734	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
735	enrich_request.appendChild(e_doc_list);
736	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
737
738	Node enrich_response = this.mr.process(enrich_message);
739
740	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
741	path = GSPath.createPath(links);
742	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
743
744	}
745	} // if provide_annotations
746
747	// use the returned id rather than the sent one cos there may have
748	// been modifiers such as .pr that are removed.
749	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
750	the_document.setAttribute("selectedNode", modified_doc_id);
751	if (has_dummy)
752	{
753	// change the id if necessary and add the content
754	Element dummy_node = (Element) doc_nodes.item(0);
755
756	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
757	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
758	// hack for simple type
759	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
760	{
761	// we dont want the internal docNode, just want the content and metadata in the document
762	// rethink this!!
763	the_document.removeChild(dummy_node);
764
765	NodeList dummy_children = dummy_node.getChildNodes();
766	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
767	{
768	// special case as we don't want more than one metadata list
769	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
770	{
771	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
772	}
773	else
774	{
775	the_document.appendChild(dummy_children.item(i));
776	}
777	}
778	}
779
780	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
781	}
782	else
783	{
784	// Merge the document content with the metadata and structure information
785	for (int i = 0; i < doc_nodes.getLength(); i++)
786	{
787	Node dn = doc_nodes.item(i);
788	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
789	if (dn_id.equals(modified_doc_id))
790	{
791	dn.appendChild(doc.importNode(dc_response_doc_content, true));
792	break;
793	}
794	}
795	}
796	}
797	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
798	return result;
799	}
800
801	protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
802	Document doc = basic_doc_list.getOwnerDocument();
803
804	Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
805	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
806	Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
807	v_message.appendChild(v_request);
808
809	// add the node list
810	v_request.appendChild(basic_doc_list);
811	Element v_response_message = (Element) this.mr.process(v_message);
812	if (processErrorElements(v_response_message, page_response))
813	{
814	return false;
815	}
816	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
817	String path = GSPath.createPath(links);
818	Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
819	if (info_elem == null) {
820	return false;
821	}
822	if (info_elem.getAttribute("valid").equals("true")) {
823	return true;
824	}
825	return false;
826
827	}
828
829	protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String opt_document_version, String document_type,
830	Element result, Element page_response, UserContext userContext ) {
831	// call get archive doc
832	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
833	String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
834	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
835	dx_message.appendChild(dx_request);
836	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
837	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
838	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
839	dx_section.setAttribute(GSXML.DOC_VERSION_ATT, opt_document_version);
840	dx_request.appendChild(dx_section);
841
842	Element dx_response_message = (Element) this.mr.process(dx_message);
843	if (processErrorElements(dx_response_message, page_response))
844	{
845	return result;
846	}
847
848	// get the section out
849	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
850	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
851	if (section == null) {
852	logger.error("no archive doc returned for "+document_id);
853	return result;
854	}
855	// convert the archive format into the internal format that the page response requires
856
857	// work out doctype
858	// NOTE: this will be coming from collection database in index
859	// the archive file doesn't store this. So we have to assume
860	// that the doc type will not be changing with any
861	// modifications happening to archives.
862
863	// if doc type is null, then we need to work it out.
864	// create a basic doc list containing the current node
865
866	if (document_type == null) {
867	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
868	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
869	basic_doc_list.appendChild(current_doc);
870	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
871	basic_doc_list.appendChild(current_doc);
872	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
873	}
874
875	if (document_type == null) {
876	logger.debug("@@@ doctype is null, setting to simple");
877	document_type = GSXML.DOC_TYPE_SIMPLE;
878	}
879
880	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
881	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
882	page_response.appendChild(doc_elem);
883
884	Element transformed_section = transformArchiveToDocument(section);
885	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
886	// simple doc, only returning a single document node, which is the top level section.
887	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
888	GSXML.mergeElements(doc_elem, transformed_section);
889	return result;
890	}
891
892	// multi sectioned document.
893	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
894	// In docEdit mode, we obtain the text from archives, from doc.xml
895	// Now the transformation has replaced <Section> with <documentNode>
896	// Need to add nodeID, nodeType and docType attributes to each docNode
897	// as doc.xml doesn't store that.
898	insertDocNodeAttributes(transformed_section, document_type, null);
899	doc_elem.appendChild(doc.importNode(transformed_section, true));
900	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
901
902	return result;
903	}
904
905
906	private boolean needSectionContent(HashMap<String, Serializable> params) {
907	String document_id = (String) params.get(GSParams.DOCUMENT);
908	String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
909	String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
910	if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
911	return true;
912	}
913
914	return false;
915	}
916	/**
917	* this method gets the collection description, the format info, the list of
918	* enrich services, etc - stuff that is needed for the page, but is the same
919	* whatever the query is - should be cached
920	*/
921	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
922	{
923	Document doc = page_response.getOwnerDocument();
924
925	// create a message to process - contains requests for the collection
926	// description, the format element, the enrich services on offer
927	// these could all be cached
928	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
929	String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
930	// the format request - ignore for now, where does this request go to??
931	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
932	info_message.appendChild(format_request);
933
934	// the enrich_services request - only do this if provide_annotations is true
935
936	if (provide_annotations)
937	{
938	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
939	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
940	info_message.appendChild(enrich_services_request);
941	}
942
943	Element info_response = (Element) this.mr.process(info_message);
944
945	// the collection is the first response
946	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
947	Element format_resp = (Element) responses.item(0);
948
949	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
950	if (format_elem != null)
951	{
952	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
953	if (global_format_elem != null)
954	{
955	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
956	}
957
958	// set the format type
959	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
960	page_response.appendChild(doc.importNode(format_elem, true));
961	}
962
963	if (provide_annotations)
964	{
965	Element services_resp = (Element) responses.item(1);
966
967	// a new message for the mr
968	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
969	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
970	boolean service_found = false;
971	for (int j = 0; j < e_services.getLength(); j++)
972	{
973	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
974	{
975	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
976	enrich_message.appendChild(s);
977	service_found = true;
978	}
979	}
980	if (service_found)
981	{
982	Element enrich_response = (Element) this.mr.process(enrich_message);
983
984	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
985	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
986	for (int i = 0; i < e_responses.getLength(); i++)
987	{
988	Element e_resp = (Element) e_responses.item(i);
989	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
990	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
991	service_list.appendChild(e_service);
992	}
993	page_response.appendChild(service_list);
994	}
995	} // if provide_annotations
996	return true;
997
998	}
999
1000	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
1001	{
1002	Document doc = basic_doc_list.getOwnerDocument();
1003
1004	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
1005	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
1006	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1007	ds_message.appendChild(ds_request);
1008
1009	// Create a parameter list to specify the required structure information
1010	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1011	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
1012	ds_param_list.appendChild(ds_param);
1013	ds_param.setAttribute(GSXML.NAME_ATT, "info");
1014	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1015
1016	ds_request.appendChild(ds_param_list);
1017
1018	// add the node list we created earlier
1019	ds_request.appendChild(basic_doc_list);
1020
1021	// Process the document structure retrieve message
1022	Element ds_response_message = (Element) this.mr.process(ds_message);
1023	if (processErrorElements(ds_response_message, page_response))
1024	{
1025	return null;
1026	}
1027
1028	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1029	String path = GSPath.createPath(links);
1030	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1031	if (info_elem == null) {
1032	return null;
1033	}
1034	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1035	if (doctype_elem != null)
1036	{
1037	String doc_type = doctype_elem.getAttribute("value");
1038	return doc_type;
1039	}
1040	return null;
1041	}
1042
1043	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1044	// The docType remains constant as in parameter document_type
1045	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1046	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1047	// The nodeType is root if rootNode, internal if there are children and leaf if no children
1048	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1049
1050	boolean isRoot = false;
1051	if(id == null) { // rootNode, get the root nodeID to work with recursively
1052	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1053	isRoot = true;
1054	} else { // for all but the root node, need to still set the nodeID
1055	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1056	}
1057
1058	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1059
1060	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1061	if(docNodes.getLength() > 0) {
1062	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1063	for(int i = 0; i < docNodes.getLength(); i++) {
1064	Element childDocNode = (Element)docNodes.item(i);
1065
1066	// work out the child docNode's nodeID based on current id
1067	String nodeID = id + "." + (i+1);
1068	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1069	}
1070	} else {
1071	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1072	}
1073
1074	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1075	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1076
1077	}
1078
1079	/** run the XSLT transform which converts from doc.xml format to our internal document format */
1080	protected Element transformArchiveToDocument(Element section) {
1081
1082	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1083	if (stylesheet_filename == null) {
1084	logger.error("Couldn't find stylesheet archive2document.xsl");
1085	return section;
1086	}
1087
1088	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1089	if (stylesheet_doc == null) {
1090	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1091	return section;
1092	}
1093
1094	Document section_doc = XMLConverter.newDOM();
1095	section_doc.appendChild(section_doc.importNode(section, true));
1096	Node result = this.transformer.transform(stylesheet_doc, section_doc);
1097	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1098
1099	Element new_element;
1100	if (result.getNodeType() == Node.DOCUMENT_NODE) {
1101	new_element = ((Document) result).getDocumentElement();
1102	} else {
1103	new_element = (Element) result;
1104	}
1105
1106
1107	return new_element;
1108
1109	}
1110
1111	protected final int NO_QUERY_TERMS = 0;
1112	protected final int NO_EQUIV_QUERY_TERMS = 1;
1113	protected final int EQUIV_QUERY_TERMS = 2;
1114	/**
1115	* this involves a bit of a hack to get the equivalent query terms - has to
1116	* requery the query service - uses the last selected service name. (if it
1117	* ends in query).
1118	*/
1119	protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1120	{
1121	Document doc = XMLConverter.newDOM();
1122
1123	// do the query again to get term info
1124	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1125	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1126
1127	HashMap previous_params = (HashMap) params.get("p");
1128	if (previous_params == null)
1129	{
1130	return NO_QUERY_TERMS;
1131	}
1132	String service_name = (String) previous_params.get(GSParams.SERVICE);
1133	if (service_name == null \|\| !service_name.endsWith("Query"))
1134	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1135	logger.debug("invalid service "+service_name+", not doing highlighting");
1136	return NO_QUERY_TERMS;
1137	}
1138
1139	String collection = (String) params.get(GSParams.COLLECTION);
1140	UserContext userContext = new UserContext(request);
1141	String to = GSPath.appendLink(collection, service_name);
1142
1143	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1144	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1145	mr_query_message.appendChild(mr_query_request);
1146
1147	// paramList
1148	HashMap service_params = (HashMap) params.get("s1");
1149
1150	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1151	GSXML.addParametersToList(query_param_list, service_params);
1152	mr_query_request.appendChild(query_param_list);
1153
1154	// do the query
1155	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1156
1157	// find the term lists
1158	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1159	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1160	if (query_term_list_element == null)
1161	{
1162	// no term info
1163	return NO_QUERY_TERMS;
1164	}
1165
1166	int result_code = NO_EQUIV_QUERY_TERMS;
1167	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1168	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1169	{
1170	// if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1171	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1172	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1173	{
1174	for (int i = 0; i < terms_nodelist.getLength(); i++)
1175	{
1176	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1177	query_term_variants.add(termValue.toLowerCase());
1178	}
1179	}
1180	}
1181	else
1182	{
1183	result_code = EQUIV_QUERY_TERMS;
1184	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1185	{
1186	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1187	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1188	for (int j = 0; j < equivalent_terms.length; j++)
1189	{
1190	query_term_variants.add(equivalent_terms[j]);
1191	}
1192	}
1193	}
1194
1195	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1196	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1197
1198	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1199	String performed_query = GSXML.getNodeText(query_element) + " ";
1200	logger.debug("performed query="+performed_query);
1201
1202	boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1203	if (performed_query.contains("\"")) {
1204	has_phrases = true;
1205	}
1206
1207	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1208	int term_start = 0;
1209	boolean in_term = false;
1210	boolean in_phrase = false;
1211	for (int i = 0; i < performed_query.length(); i++) {
1212
1213	char character = performed_query.charAt(i);
1214	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1215
1216	// Has a query term just started?
1217	if (in_term == false && is_character_letter_or_digit == true)
1218	{
1219	in_term = true;
1220	term_start = i;
1221	}
1222
1223	// Or has a term just finished?
1224	else if (in_term == true && is_character_letter_or_digit == false)
1225	{
1226	in_term = false;
1227	String term = performed_query.substring(term_start, i);
1228	if (has_phrases) {
1229	// do the phrase bit
1230	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1231	if (result_code == EQUIV_QUERY_TERMS) {
1232	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1233	if (term_element != null) {
1234	// might be null for eg TX in [snails]:TX
1235
1236	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1237	if (term_equivalent_terms_nodelist != null \|\| term_equivalent_terms_nodelist.getLength() != 0) {
1238	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1239	{
1240	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1241	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1242	for (int k = 0; k < term_equivalent_terms.length; k++)
1243	{
1244	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1245	}
1246	}
1247	}
1248	}
1249	} else { // result_code != EQUIV_QUERY_TERMS
1250	// we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1251	if (query_term_variants.contains(term.toLowerCase()) \|\| containsSubString(query_term_variants, term)) {
1252	// this handles the case where the user has searched for snails, but term list returns 'snail'
1253	phrase_query_p_term_x_variants.add(term.toLowerCase());
1254	}
1255	}
1256	if (phrase_query_p_term_x_variants.size()>0) {
1257	// we have found a valid term
1258	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1259
1260	if (in_phrase == false)
1261	{
1262	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1263	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1264	}
1265	}
1266	} // end if has_phrases
1267	else {
1268	// no phrases so we don't have to do the phrasey stuff. but
1269	// we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1270	// we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1271	// but dont want to include eg TX
1272	if (result_code == NO_EQUIV_QUERY_TERMS) {
1273	if (containsSubString(query_term_variants, term)) {
1274	query_term_variants.add(term.toLowerCase());
1275	}
1276	}
1277
1278	}
1279	} // end of in_term...
1280	// Watch for phrases (surrounded by quotes)
1281	if (character == '\"') {
1282
1283	// Has a phrase just started?
1284	if (in_phrase == false)
1285	{
1286	in_phrase = true;
1287	}
1288	// Or has a phrase just finished?
1289	else if (in_phrase == true)
1290	{
1291	in_phrase = false;
1292	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1293	}
1294
1295	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1296	} // if char == "
1297	} // for each char in performed query
1298
1299	return result_code;
1300	}
1301
1302	protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1303	// hack to filter out TX, TI field names
1304	String lc_term = term.toLowerCase();
1305	if (query_term_variants.contains(term)) {
1306	return false; // or true??
1307	}
1308	if (term.matches("[A-Z][A-Z][A-Z]?")) {
1309	return false;
1310	}
1311	Iterator i = query_term_variants.iterator();
1312	while (i.hasNext()) {
1313	String t = (String)i.next();
1314	if (term.startsWith(t)) {
1315	return true;
1316	}
1317	}
1318	return false;
1319	}
1320
1321
1322	/** retrieve the marked up highlighted section - only works for solr collection */
1323	protected Element retrieveHighlightedContent(Element request, String node_id) {
1324
1325	Document doc = XMLConverter.newDOM();
1326
1327	// do the query again to get term info
1328	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1329	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1330
1331	HashMap previous_params = (HashMap) params.get("p");
1332	if (previous_params == null)
1333	{
1334	return null;
1335	}
1336	String service_name = (String) previous_params.get(GSParams.SERVICE);
1337	if (service_name == null \|\| !service_name.endsWith("Query"))
1338	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1339	logger.debug("HL invalid service, not doing highlighting");
1340	return null;
1341	}
1342
1343	String collection = (String) params.get(GSParams.COLLECTION);
1344	UserContext userContext = new UserContext(request);
1345	String to = GSPath.appendLink(collection, service_name);
1346
1347	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1348	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1349	mr_query_message.appendChild(mr_query_request);
1350
1351	// paramList
1352	HashMap service_params = (HashMap) params.get("s1");
1353
1354	// hack in case the user searched on eg titles, but we want highlighting in the text
1355	service_params.put("index", "TX");
1356	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1357	GSXML.addParametersToList(query_param_list, service_params);
1358
1359	if (node_id != null) {
1360	GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1361	} else {
1362	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1363	}
1364	mr_query_request.appendChild(query_param_list);
1365	// do the query
1366
1367	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1368	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1369	Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1370
1371	if (highlighted_node == null) {
1372	return null;
1373	}
1374	// For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1375	//We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1376
1377	// Build a request to process highlighted text
1378
1379	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1380	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1381	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1382	hl_message.appendChild(dc_request);
1383
1384	// Create a parameter list to specify the request parameters - empty for now
1385	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1386	dc_request.appendChild(dc_param_list);
1387
1388	// get the content
1389	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1390	dc_request.appendChild(doc_list);
1391	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1392	doc_list.appendChild(current_doc);
1393	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1394	//Append highlighted content to request for processing
1395	dc_request.appendChild(doc.importNode(highlighted_node, true));
1396	Element hl_response_message = (Element) this.mr.process(hl_message);
1397	//Get results
1398	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1399	Element content = (Element) contentList.item(0);
1400	return content;
1401
1402
1403	}
1404	/**
1405	* Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1406	*/
1407	protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1408
1409	NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1410	for (int j=named_elems.getLength()-1; j>=0; j--) {
1411	Element this_elem = (Element)named_elems.item(j);
1412	Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1413	this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1414	}
1415	return true;
1416	}
1417	/**
1418	* Highlights query terms in the text content of an element.
1419	*/
1420	private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1421	{
1422	String content = GSXML.getNodeText(original_element);
1423	// Convert the content string to an array of characters for speed
1424	char[] content_characters = new char[content.length()];
1425	content.getChars(0, content.length(), content_characters, 0);
1426
1427	// Now skim through the content, identifying word matches
1428	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1429	int word_start = 0;
1430	boolean in_word = false;
1431	boolean preceding_word_matched = false;
1432	boolean inTag = false;
1433	for (int i = 0; i < content_characters.length; i++)
1434	{
1435	//We don't want to find words inside HTML tags
1436	if (content_characters[i] == '<')
1437	{
1438	// are we currently in a word?
1439	if (in_word) {
1440	in_word = false;
1441	String word = new String(content_characters, word_start, (i - word_start));
1442	if (case_insensitive) {
1443	word = word.toLowerCase();
1444	}
1445	if (query_term_variants.contains(word)) {
1446	// We have found a matching word, so remember its location
1447	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1448	// should preceding word matched be set to true/false here??
1449	preceding_word_matched = true;
1450	} else {
1451	preceding_word_matched = false;
1452	}
1453	}
1454	inTag = true;
1455	continue;
1456	}
1457	else if (inTag && content_characters[i] == '>')
1458	{
1459	inTag = false;
1460	continue;
1461	}
1462	else if (inTag)
1463	{
1464	continue;
1465	}
1466
1467	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1468
1469	// Has a word just started?
1470	if (in_word == false && is_character_letter_or_digit == true)
1471	{
1472	in_word = true;
1473	word_start = i;
1474	}
1475
1476	// Or has a word just finished?
1477	else if (in_word == true && is_character_letter_or_digit == false)
1478	{
1479	in_word = false;
1480
1481	// Check if the word matches any of the query term equivalents
1482	String word = new String(content_characters, word_start, (i - word_start));
1483	if (case_insensitive) {
1484	word = word.toLowerCase();
1485	}
1486	if (query_term_variants.contains(word))
1487	{
1488	// We have found a matching word, so remember its location
1489	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1490	preceding_word_matched = true;
1491	}
1492	else
1493	{
1494	preceding_word_matched = false;
1495	}
1496	}
1497	}
1498
1499	// Don't forget the last word...
1500	if (in_word == true)
1501	{
1502	// Check if the word matches any of the query term equivalents
1503	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1504	if (case_insensitive) {
1505	word = word.toLowerCase();
1506	}
1507	if (query_term_variants.contains(word))
1508	{
1509	// We have found a matching word, so remember its location
1510	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1511	}
1512	}
1513
1514	if (word_matches.size() == 0) {
1515	// just return a copy of the original element
1516	return (Element)doc.importNode(original_element, true);
1517
1518	}
1519
1520	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1521	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1522
1523	if (phrase_query_term_variants_hierarchy.size() ==0) {
1524	for (int i = 0; i < word_matches.size(); i++) {
1525	highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1526	highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1527	}
1528	}
1529	else {
1530	// Deal with phrases now
1531	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1532	for (int i = 0; i < word_matches.size(); i++)
1533	{
1534	WordMatch word_match = word_matches.get(i);
1535
1536	// See if any partial phrase matches are extended by this word
1537	if (word_match.preceding_word_matched)
1538	{
1539	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1540	{
1541	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1542	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1543	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1544	if (phrase_query_p_term_x_variants.contains(word_match.word))
1545	{
1546	partial_phrase_match.num_words_matched++;
1547
1548	// Has a complete phrase match occurred?
1549	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1550	{
1551	// Check for overlaps by looking at the previous highlight range
1552	if (!highlight_end_positions.isEmpty())
1553	{
1554	int last_highlight_index = highlight_end_positions.size() - 1;
1555	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1556	if (last_highlight_end > partial_phrase_match.start_position)
1557	{
1558	// There is an overlap, so remove the previous phrase match
1559	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1560	highlight_end_positions.remove(last_highlight_index);
1561	partial_phrase_match.start_position = last_highlight_start;
1562	}
1563	}
1564
1565	highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1566	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1567	}
1568	// No, but add the partial match back into the list for next time
1569	else
1570	{
1571	partial_phrase_matches.add(partial_phrase_match);
1572	}
1573	}
1574	}
1575	}
1576	else
1577	{
1578	partial_phrase_matches.clear();
1579	}
1580
1581	// See if this word is at the start of any of the phrases
1582	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1583	{
1584	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1585	if (phrase_query_p_term_variants_list.size()>0) {
1586	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1587	if (phrase_query_p_term_1_variants.contains(word_match.word))
1588	{
1589	// If this phrase is just one word long, we have a complete match
1590	if (phrase_query_p_term_variants_list.size() == 1)
1591	{
1592	highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1593	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1594	}
1595	// Otherwise we have the start of a potential phrase match
1596	else
1597	{
1598	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1599	}
1600	}
1601	}
1602	}
1603	}
1604	}
1605
1606	// Now add the annotation tags into the document at the correct points
1607	Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1608	int last_wrote = 0;
1609	for (int i = 0; i < highlight_start_positions.size(); i++)
1610	{
1611	int highlight_start = highlight_start_positions.get(i).intValue();
1612	int highlight_end = highlight_end_positions.get(i).intValue();
1613
1614	// Print anything before the highlight range
1615	if (last_wrote < highlight_start)
1616	{
1617	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1618	content_element.appendChild(doc.createTextNode(preceding_text));
1619	}
1620
1621	// Print the highlight text, annotated
1622	if (highlight_end > last_wrote)
1623	{
1624	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1625	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1626	annotation_element.setAttribute("type", "query_term");
1627	content_element.appendChild(annotation_element);
1628	last_wrote = highlight_end;
1629	}
1630	}
1631
1632	// Finish off any unwritten text
1633	if (last_wrote < content_characters.length)
1634	{
1635	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1636	content_element.appendChild(doc.createTextNode(remaining_text));
1637	}
1638	return content_element;
1639	}
1640
1641
1642	static private class WordMatch
1643	{
1644	public String word;
1645	public int start_position;
1646	public int end_position;
1647	public boolean preceding_word_matched;
1648
1649	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1650	{
1651	this.word = word;
1652	this.start_position = start_position;
1653	this.end_position = end_position;
1654	this.preceding_word_matched = preceding_word_matched;
1655	}
1656	}
1657
1658	static private class PartialPhraseMatch
1659	{
1660	public int start_position;
1661	public int query_phrase_number;
1662	public int num_words_matched;
1663
1664	public PartialPhraseMatch(int start_position, int query_phrase_number)
1665	{
1666	this.start_position = start_position;
1667	this.query_phrase_number = query_phrase_number;
1668	this.num_words_matched = 1;
1669	}
1670	}
1671	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: