Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32546

Last change on this file since 32546 was 32546, checked in by kjdon, 5 years ago
use variable instead of hard coded string for paramDefault
Property svn:keywords set to `Author Date Id Revision`
File size: 61.8 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24	import org.greenstone.util.GlobalProperties;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Text;
31	import org.w3c.dom.NodeList;
32
33	// General Java classes
34	import java.util.ArrayList;
35	import java.util.HashMap;
36	import java.util.HashSet;
37	import java.util.Iterator;
38	import java.io.File;
39	import java.io.Serializable;
40
41	import org.apache.log4j.*;
42
43	/** Action class for retrieving Documents via the message router */
44	public class DocumentAction extends Action
45	{
46
47	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
48
49	// this is used to specify that the sibling nodes of a selected one should be obtained
50	public static final String SIBLING_ARG = "sib";
51	public static final String GOTO_PAGE_ARG = "gp";
52	public static final String ENRICH_DOC_ARG = "end";
53	public static final String EXPAND_DOCUMENT_ARG = "ed";
54	public static final String EXPAND_CONTENTS_ARG = "ec";
55	public static final String REALISTIC_BOOK_ARG = "book";
56	public static final String NO_TEXT_ARG = "noText";
57	public static final String DOC_EDIT_ARG = "docEdit";
58
59	/**
60	* if this is set to true, when a document is displayed, any annotation type
61	* services (enrich) will be offered to the user as well
62	*/
63	protected boolean provide_annotations = false;
64
65	protected boolean highlight_query_terms = false;
66
67	public boolean configure()
68	{
69	super.configure();
70	String highlight = (String) config_params.get("highlightQueryTerms");
71	if (highlight != null && highlight.equals("true"))
72	{
73	highlight_query_terms = true;
74	}
75	String annotate = (String) config_params.get("displayAnnotationService");
76	if (annotate != null && annotate.equals("true"))
77	{
78	provide_annotations = true;
79	}
80	return true;
81	}
82
83	public Node process(Node message_node)
84	{
85	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
86
87	Element message = GSXML.nodeToElement(message_node);
88	Document doc = XMLConverter.newDOM();
89
90	// the response
91	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
92	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
93	result.appendChild(page_response);
94
95	// get the request - assume only one
96	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
97	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
98	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
99
100	// just in case there are some that need to get passed to the services
101	// why do we use s0 here and s1 in other places???
102	HashMap service_params = (HashMap) params.get("s0");
103
104	String collection = (String) params.get(GSParams.COLLECTION);
105	String document_id = (String) params.get(GSParams.DOCUMENT);
106	if (document_id != null && document_id.equals(""))
107	{
108	document_id = null;
109	}
110	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
111	if (href != null && href.equals(""))
112	{
113	href = null;
114	}
115	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
116	if (document_id == null && href == null)
117	{
118	logger.error("no document specified!");
119	return result;
120	}
121	if (rl != null && rl.equals("0"))
122	{
123	// this is a true external link, we should have been directed to a different page or action
124	logger.error("rl value was 0, shouldn't get here");
125	return result;
126	}
127
128	UserContext userContext = new UserContext(request);
129
130	//append site metadata
131	addSiteMetadata(page_response, userContext);
132	addInterfaceOptions(page_response);
133
134	// get the additional data needed for the page
135	getBackgroundData(page_response, collection, userContext);
136	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
137
138	if (format_elem != null) {
139	// lets look for param defaults set in config file
140	NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
141	for (int i=0; i<param_defaults.getLength(); i++) {
142	Element p = (Element)param_defaults.item(i);
143	String name = p.getAttribute(GSXML.NAME_ATT);
144	if (params.get(name) ==null) {
145	// wasn't set from interface
146	String value = p.getAttribute(GSXML.VALUE_ATT);
147	params.put(name, value );
148	// also add into request param xml so that xslt knows it too
149	GSXML.addParameterToList(cgi_paramList, name, value);
150	}
151	}
152	}
153
154	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
155	if (document_type != null && document_type.equals(""))
156	{
157	//document_type = "hierarchy";
158	document_type = null; // we'll get it later if not already specified
159	}
160	// what if it is null here?? Anu to check...
161
162
163	boolean editing_document = false;
164	String doc_edit = (String) params.get(DOC_EDIT_ARG);
165	if (doc_edit != null && doc_edit.equals("1")) {
166	editing_document = true;
167	}
168
169	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
170	if (editing_document) {
171	return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
172	}
173
174	//whether to retrieve siblings or not
175	boolean get_siblings = false;
176	String sibs = (String) params.get(SIBLING_ARG);
177	if (sibs != null && sibs.equals("1"))
178	{
179	get_siblings = true;
180	}
181
182	String doc_id_modifier = "";
183	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
184	if (sibling_num != null && !sibling_num.equals(""))
185	{
186	// we have to modify the doc name
187	doc_id_modifier = "." + sibling_num + ".ss";
188	}
189
190	boolean expand_document = false;
191	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
192	if (ed_arg != null && ed_arg.equals("1"))
193	{
194	expand_document = true;
195	}
196
197	boolean expand_contents = false;
198	if (expand_document)
199	{ // we always expand the contents with the text
200	expand_contents = true;
201	}
202	else
203	{
204	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
205	if (ec_arg != null && ec_arg.equals("1"))
206	{
207	expand_contents = true;
208	}
209	}
210
211	// do we want text content? Not if no_text=1.
212	// expand_document overrides this. - should it??
213	boolean get_text = true;
214	String nt_arg = (String) params.get(NO_TEXT_ARG);
215
216	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
217	logger.debug("SETTING GET TEXT TO FALSE");
218	get_text = false;
219	} else {
220	logger.debug("GET TEXT REMAINS TRUE");
221	}
222
223	// the_document is where all the doc info - structure and metadata etc
224	// is added into, to be returned in the page
225	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
226	page_response.appendChild(the_document);
227
228	// create a basic doc list containing the current node
229	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
230	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
231	basic_doc_list.appendChild(current_doc);
232	if (document_id != null)
233	{
234	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
235	}
236	else
237	{
238	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
239	// do we need this??
240	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
241	}
242
243	if (document_type == null)
244	{
245	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
246	}
247	if (document_type == null)
248	{
249	logger.debug("##### doctype is null, setting to simple");
250	document_type = GSXML.DOC_TYPE_SIMPLE;
251	}
252
253	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
254
255	// start getting doc structure
256
257	// Create a parameter list to specify the required structure information
258	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
259
260	if (service_params != null)
261	{
262	GSXML.addParametersToList(ds_param_list, service_params);
263	}
264
265	Element ds_param = null;
266	boolean get_structure = false;
267	boolean get_structure_info = false;
268	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
269	{
270	get_structure_info = true;
271
272	if (expand_contents)
273	{
274	ds_param = doc.createElement(GSXML.PARAM_ELEM);
275	ds_param_list.appendChild(ds_param);
276	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
277	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
278	}
279
280	// get the info needed for paged naviagtion
281	ds_param = doc.createElement(GSXML.PARAM_ELEM);
282	ds_param_list.appendChild(ds_param);
283	ds_param.setAttribute(GSXML.NAME_ATT, "info");
284	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
285	ds_param = doc.createElement(GSXML.PARAM_ELEM);
286	ds_param_list.appendChild(ds_param);
287	ds_param.setAttribute(GSXML.NAME_ATT, "info");
288	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
289	ds_param = doc.createElement(GSXML.PARAM_ELEM);
290	ds_param_list.appendChild(ds_param);
291	ds_param.setAttribute(GSXML.NAME_ATT, "info");
292	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
293
294	if (get_siblings)
295	{
296	ds_param = doc.createElement(GSXML.PARAM_ELEM);
297	ds_param_list.appendChild(ds_param);
298	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
299	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
300	}
301
302	}
303	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
304	{
305	get_structure = true;
306	if (expand_contents)
307	{
308	ds_param = doc.createElement(GSXML.PARAM_ELEM);
309	ds_param_list.appendChild(ds_param);
310	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
311	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
312	}
313	else
314	{
315	// get the info needed for table of contents
316	ds_param = doc.createElement(GSXML.PARAM_ELEM);
317	ds_param_list.appendChild(ds_param);
318	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
319	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
320	ds_param = doc.createElement(GSXML.PARAM_ELEM);
321	ds_param_list.appendChild(ds_param);
322	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
323	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
324	if (get_siblings)
325	{
326	ds_param = doc.createElement(GSXML.PARAM_ELEM);
327	ds_param_list.appendChild(ds_param);
328	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
329	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
330	}
331	}
332	}
333	else
334	{
335	// we dont need any structure
336	}
337
338	boolean has_dummy = false;
339	if (get_structure \|\| get_structure_info)
340	{
341
342	// Build a request to obtain the document structure
343	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
344	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
345	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
346	ds_message.appendChild(ds_request);
347	ds_request.appendChild(ds_param_list);
348
349	// add the node list we created earlier
350	ds_request.appendChild(basic_doc_list);
351
352	// Process the document structure retrieve message
353	Element ds_response_message = (Element) this.mr.process(ds_message);
354	if (processErrorElements(ds_response_message, page_response))
355	{
356	return result;
357	}
358
359	// get the info and print out
360	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
361	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
362	path = GSPath.appendLink(path, "nodeStructureInfo");
363	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
364	// get the doc_node bit
365	if (ds_response_struct_info != null)
366	{
367	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
368	}
369	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
370	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
371	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
372	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
373
374	if (ds_response_structure != null)
375	{
376	// add the contents of the structure bit into the_document
377	NodeList structs = ds_response_structure.getChildNodes();
378	for (int i = 0; i < structs.getLength(); i++)
379	{
380	the_document.appendChild(doc.importNode(structs.item(i), true));
381	}
382	}
383	else
384	{
385	// no structure nodes, so put in a dummy doc node
386	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
387	if (document_id != null)
388	{
389	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
390	}
391	else
392	{
393	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
394
395	}
396	the_document.appendChild(doc_node);
397	has_dummy = true;
398	}
399	}
400	else
401	{ // a simple type - we dont have a dummy node for simple
402	// should think about this more
403	// no structure request, so just put in a dummy doc node
404	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
405	if (document_id != null)
406	{
407	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
408	}
409	else
410	{
411	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
412	}
413	the_document.appendChild(doc_node);
414	has_dummy = true;
415	}
416
417	// end getting doc structure
418
419	// start getting doc metadata
420
421	// Build a request to obtain some document metadata
422	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
423	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
424	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
425	dm_message.appendChild(dm_request);
426	// Create a parameter list to specify the required metadata information
427
428	HashSet<String> meta_names = new HashSet<String>();
429	meta_names.add("Title"); // the default
430	if (format_elem != null)
431	{
432	getRequiredMetadataNames(format_elem, meta_names);
433	}
434
435	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
436	if (extraMetaListElem != null)
437	{
438	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
439	for (int i = 0; i < extraMetaList.getLength(); i++)
440	{
441	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
442	}
443	}
444
445	Element dm_param_list = createMetadataParamList(doc,meta_names);
446	if (service_params != null)
447	{
448	GSXML.addParametersToList(dm_param_list, service_params);
449	}
450
451	dm_request.appendChild(dm_param_list);
452
453	// create the doc node list for the metadata request
454	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
455	dm_request.appendChild(dm_doc_list);
456
457	// Add each node from the structure response into the metadata request
458	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
459	for (int i = 0; i < doc_nodes.getLength(); i++)
460	{
461	Element doc_node = (Element) doc_nodes.item(i);
462	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
463
464	// Add the documentNode to the list
465	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
466	if (needSectionContent(params)) {
467	if (doc_node_id.equals(document_id)) {
468	dm_doc_list.appendChild(dm_doc_node);
469	}
470	} else {
471	dm_doc_list.appendChild(dm_doc_node);
472	}
473	//dm_doc_list.appendChild(dm_doc_node);
474	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
475	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
476	if (document_id == null){
477	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
478	}
479
480	}
481	// we also want a metadata request to the top level document to get
482	// assocfilepath - this could be cached too
483	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
484	dm_message.appendChild(doc_meta_request);
485	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
486	if (service_params != null)
487	{
488	GSXML.addParametersToList(doc_meta_param_list, service_params);
489	}
490
491	doc_meta_request.appendChild(doc_meta_param_list);
492	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
493	doc_meta_param_list.appendChild(doc_param);
494	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
495	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
496
497	// create the doc node list for the metadata request
498	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
499	doc_meta_request.appendChild(doc_list);
500
501	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
502	// the node we want is the root document node
503	if (document_id != null)
504	{
505	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
506	}
507	/*else
508	{
509	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
510	// can we assume that href is always a top level doc??
511	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
512	//doc_node.setAttribute("externalURL", has_rl);
513	}*/
514	doc_list.appendChild(doc_node);
515
516	Element dm_response_message = (Element) this.mr.process(dm_message);
517	if (processErrorElements(dm_response_message, page_response))
518	{
519	return result;
520	}
521
522	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
523	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
524
525	// Merge the metadata with the structure information
526	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
527	for (int i = 0; i < doc_nodes.getLength(); i++)
528	{
529	Node dcNode;
530	String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
531	if (node_idd.isEmpty()) {
532	String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
533	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
534	} else {
535	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
536	}
537	GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
538	}
539	// get the top level doc metadata out
540	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
541	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
542	GSXML.mergeMetadataLists(the_document, top_doc_node);
543
544	// if we are highlighting query terms, then we also get them highlighted in the metadata
545
546	HashSet<String> query_term_variants = null;
547	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
548	boolean do_highlight_query_terms = highlight_query_terms;
549	int query_terms_status = 0;
550	if (highlight_query_terms) {
551	// lets get the query term equivalents
552	query_term_variants = new HashSet<String>();
553	phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
554	if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
555	do_highlight_query_terms = false; // we couldn't get the terms
556	}
557	}
558
559	// lets try marking up the metadata with search terms
560	// if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
561	boolean highlight_case_insensitive = false;
562	if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
563	highlight_case_insensitive = true;
564	}
565	if (do_highlight_query_terms) {
566	highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
567	}
568
569	// do we want doc text content? If not, we are done.
570	if (!get_text) {
571	// don't get text
572	return result;
573	}
574
575	// Build a request to obtain some document content
576	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
577	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
578	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
579	dc_message.appendChild(dc_request);
580
581	// Create a parameter list to specify the request parameters - empty for now
582	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
583	if (service_params != null)
584	{
585	GSXML.addParametersToList(dc_param_list, service_params);
586	}
587
588	dc_request.appendChild(dc_param_list);
589
590	// get the content
591	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
592	if (expand_document)
593	{
594	dc_request.appendChild(dm_doc_list);
595	}
596	else
597	{
598	dc_request.appendChild(basic_doc_list);
599	}
600	Element dc_response_message = (Element) this.mr.process(dc_message);
601
602	if (processErrorElements(dc_response_message, page_response))
603	{
604	return result;
605
606	}
607	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
608
609	boolean get_marked_up_doc_from_query = false;
610	if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
611	get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
612	}
613
614	if (expand_document)
615	{
616	// Merge the content with the structure information
617	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
618	for (int i = 0; i < doc_nodes.getLength(); i++)
619	{
620	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
621	Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
622	Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
623	if (content != null)
624	{
625	if (do_highlight_query_terms) {
626	if (get_marked_up_doc_from_query) {
627
628	Element new_content = retrieveHighlightedContent(request, node_id);
629
630	if (new_content == null) {
631	// we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
632	get_marked_up_doc_from_query = false;
633	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
634	} else {
635	content= new_content;
636	}
637	} else {
638	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
639	}
640	}
641	doc_nodes.item(i).appendChild(doc.importNode(content, true));
642	}
643
644	}
645	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
646	Element dummy_node = (Element) doc_nodes.item(0);
647	the_document.removeChild(dummy_node);
648	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
649	NodeList dummy_children = dummy_node.getChildNodes();
650	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
651	{
652	// special case as we don't want more than one metadata list
653	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
654	{
655	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
656	}
657	else
658	{
659	the_document.appendChild(dummy_children.item(i));
660	}
661	}
662	}
663	}
664	else
665	{
666	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
667	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
668
669	if (dc_response_doc_content == null)
670	{
671	// no content to add
672	if (dc_response_doc.getAttribute("external").equals("true"))
673	{
674	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
675
676	the_document.setAttribute("selectedNode", href_id);
677	the_document.setAttribute("external", href_id);
678	}
679	return result;
680	}
681	if (do_highlight_query_terms)
682	{
683	dc_response_doc.removeChild(dc_response_doc_content);
684	if (get_marked_up_doc_from_query) {
685	Element new_content = retrieveHighlightedContent(request, null);
686	if (new_content == null) {
687	get_marked_up_doc_from_query = false;
688	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
689	} else {
690
691	dc_response_doc_content = new_content;
692	}
693	} else {
694	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
695	}
696	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
697	}
698
699	if (provide_annotations)
700	{
701	String service_selected = (String) params.get(ENRICH_DOC_ARG);
702	if (service_selected != null && service_selected.equals("1"))
703	{
704	// now we can modifiy the response doc if needed
705	String enrich_service = (String) params.get(GSParams.SERVICE);
706	// send a message to the service
707	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
708	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
709	enrich_message.appendChild(enrich_request);
710	// check for parameters
711	HashMap e_service_params = (HashMap) params.get("s1");
712	if (e_service_params != null)
713	{
714	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
715	GSXML.addParametersToList(enrich_pl, e_service_params);
716	enrich_request.appendChild(enrich_pl);
717	}
718	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
719	enrich_request.appendChild(e_doc_list);
720	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
721
722	Node enrich_response = this.mr.process(enrich_message);
723
724	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
725	path = GSPath.createPath(links);
726	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
727
728	}
729	} // if provide_annotations
730
731	// use the returned id rather than the sent one cos there may have
732	// been modifiers such as .pr that are removed.
733	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
734	the_document.setAttribute("selectedNode", modified_doc_id);
735	if (has_dummy)
736	{
737	// change the id if necessary and add the content
738	Element dummy_node = (Element) doc_nodes.item(0);
739
740	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
741	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
742	// hack for simple type
743	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
744	{
745	// we dont want the internal docNode, just want the content and metadata in the document
746	// rethink this!!
747	the_document.removeChild(dummy_node);
748
749	NodeList dummy_children = dummy_node.getChildNodes();
750	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
751	{
752	// special case as we don't want more than one metadata list
753	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
754	{
755	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
756	}
757	else
758	{
759	the_document.appendChild(dummy_children.item(i));
760	}
761	}
762	}
763
764	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
765	}
766	else
767	{
768	// Merge the document content with the metadata and structure information
769	for (int i = 0; i < doc_nodes.getLength(); i++)
770	{
771	Node dn = doc_nodes.item(i);
772	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
773	if (dn_id.equals(modified_doc_id))
774	{
775	dn.appendChild(doc.importNode(dc_response_doc_content, true));
776	break;
777	}
778	}
779	}
780	}
781	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
782	return result;
783	}
784
785	protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
786	// call get archive doc
787	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
788	String to = "DocXMLGetSection";
789	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
790	dx_message.appendChild(dx_request);
791	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
792	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
793	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
794	dx_request.appendChild(dx_section);
795
796	Element dx_response_message = (Element) this.mr.process(dx_message);
797	if (processErrorElements(dx_response_message, page_response))
798	{
799	return result;
800	}
801
802	// get the section out
803	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
804	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
805	if (section == null) {
806	logger.error("no archive doc returned for "+document_id);
807	return result;
808	}
809	// convert the archive format into the internal format that the page response requires
810
811	// work out doctype
812	// NOTE: this will be coming from collection database in index
813	// the archive file doesn't store this. So we have to assume
814	// that the doc type will not be changing with any
815	// modifications happening to archives.
816
817	// if doc type is null, then we need to work it out.
818	// create a basic doc list containing the current node
819
820	if (document_type == null) {
821	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
822	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
823	basic_doc_list.appendChild(current_doc);
824	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
825	basic_doc_list.appendChild(current_doc);
826	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
827	}
828
829	if (document_type == null) {
830	logger.debug("@@@ doctype is null, setting to simple");
831	document_type = GSXML.DOC_TYPE_SIMPLE;
832	}
833
834	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
835	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
836	page_response.appendChild(doc_elem);
837
838	Element transformed_section = transformArchiveToDocument(section);
839	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
840	// simple doc, only returning a single document node, which is the top level section.
841	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
842	GSXML.mergeElements(doc_elem, transformed_section);
843	return result;
844	}
845
846	// multi sectioned document.
847	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
848	// In docEdit mode, we obtain the text from archives, from doc.xml
849	// Now the transformation has replaced <Section> with <documentNode>
850	// Need to add nodeID, nodeType and docType attributes to each docNode
851	// as doc.xml doesn't store that.
852	insertDocNodeAttributes(transformed_section, document_type, null);
853	doc_elem.appendChild(doc.importNode(transformed_section, true));
854	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
855
856	return result;
857	}
858
859
860	private boolean needSectionContent(HashMap<String, Serializable> params) {
861	String document_id = (String) params.get(GSParams.DOCUMENT);
862	String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
863	String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
864	if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
865	return true;
866	}
867
868	return false;
869	}
870	/**
871	* this method gets the collection description, the format info, the list of
872	* enrich services, etc - stuff that is needed for the page, but is the same
873	* whatever the query is - should be cached
874	*/
875	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
876	{
877	Document doc = page_response.getOwnerDocument();
878
879	// create a message to process - contains requests for the collection
880	// description, the format element, the enrich services on offer
881	// these could all be cached
882	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
883	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
884	// the format request - ignore for now, where does this request go to??
885	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
886	info_message.appendChild(format_request);
887
888	// the enrich_services request - only do this if provide_annotations is true
889
890	if (provide_annotations)
891	{
892	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
893	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
894	info_message.appendChild(enrich_services_request);
895	}
896
897	Element info_response = (Element) this.mr.process(info_message);
898
899	// the collection is the first response
900	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
901	Element format_resp = (Element) responses.item(0);
902
903	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
904	if (format_elem != null)
905	{
906	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
907	if (global_format_elem != null)
908	{
909	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
910	}
911
912	// set the format type
913	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
914	page_response.appendChild(doc.importNode(format_elem, true));
915	}
916
917	if (provide_annotations)
918	{
919	Element services_resp = (Element) responses.item(1);
920
921	// a new message for the mr
922	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
923	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
924	boolean service_found = false;
925	for (int j = 0; j < e_services.getLength(); j++)
926	{
927	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
928	{
929	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
930	enrich_message.appendChild(s);
931	service_found = true;
932	}
933	}
934	if (service_found)
935	{
936	Element enrich_response = (Element) this.mr.process(enrich_message);
937
938	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
939	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
940	for (int i = 0; i < e_responses.getLength(); i++)
941	{
942	Element e_resp = (Element) e_responses.item(i);
943	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
944	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
945	service_list.appendChild(e_service);
946	}
947	page_response.appendChild(service_list);
948	}
949	} // if provide_annotations
950	return true;
951
952	}
953
954	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
955	{
956	Document doc = basic_doc_list.getOwnerDocument();
957
958	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
959	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
960	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
961	ds_message.appendChild(ds_request);
962
963	// Create a parameter list to specify the required structure information
964	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
965	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
966	ds_param_list.appendChild(ds_param);
967	ds_param.setAttribute(GSXML.NAME_ATT, "info");
968	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
969
970	ds_request.appendChild(ds_param_list);
971
972	// add the node list we created earlier
973	ds_request.appendChild(basic_doc_list);
974
975	// Process the document structure retrieve message
976	Element ds_response_message = (Element) this.mr.process(ds_message);
977	if (processErrorElements(ds_response_message, page_response))
978	{
979	return null;
980	}
981
982	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
983	String path = GSPath.createPath(links);
984	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
985	if (info_elem == null) {
986	return null;
987	}
988	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
989	if (doctype_elem != null)
990	{
991	String doc_type = doctype_elem.getAttribute("value");
992	return doc_type;
993	}
994	return null;
995	}
996
997	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
998	// The docType remains constant as in parameter document_type
999	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1000	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1001	// The nodeType is root if rootNode, internal if there are children and leaf if no children
1002	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1003
1004	boolean isRoot = false;
1005	if(id == null) { // rootNode, get the root nodeID to work with recursively
1006	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1007	isRoot = true;
1008	} else { // for all but the root node, need to still set the nodeID
1009	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1010	}
1011
1012	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1013
1014	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1015	if(docNodes.getLength() > 0) {
1016	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1017	for(int i = 0; i < docNodes.getLength(); i++) {
1018	Element childDocNode = (Element)docNodes.item(i);
1019
1020	// work out the child docNode's nodeID based on current id
1021	String nodeID = id + "." + (i+1);
1022	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1023	}
1024	} else {
1025	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1026	}
1027
1028	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1029	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1030
1031	}
1032
1033	/** run the XSLT transform which converts from doc.xml format to our internal document format */
1034	protected Element transformArchiveToDocument(Element section) {
1035
1036	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1037	if (stylesheet_filename == null) {
1038	logger.error("Couldn't find stylesheet archive2document.xsl");
1039	return section;
1040	}
1041
1042	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1043	if (stylesheet_doc == null) {
1044	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1045	return section;
1046	}
1047
1048	Document section_doc = XMLConverter.newDOM();
1049	section_doc.appendChild(section_doc.importNode(section, true));
1050	Node result = this.transformer.transform(stylesheet_doc, section_doc);
1051	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1052
1053	Element new_element;
1054	if (result.getNodeType() == Node.DOCUMENT_NODE) {
1055	new_element = ((Document) result).getDocumentElement();
1056	} else {
1057	new_element = (Element) result;
1058	}
1059
1060
1061	return new_element;
1062
1063	}
1064
1065	protected final int NO_QUERY_TERMS = 0;
1066	protected final int NO_EQUIV_QUERY_TERMS = 1;
1067	protected final int EQUIV_QUERY_TERMS = 2;
1068	/**
1069	* this involves a bit of a hack to get the equivalent query terms - has to
1070	* requery the query service - uses the last selected service name. (if it
1071	* ends in query).
1072	*/
1073	protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1074	{
1075	Document doc = XMLConverter.newDOM();
1076
1077	// do the query again to get term info
1078	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1079	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1080
1081	HashMap previous_params = (HashMap) params.get("p");
1082	if (previous_params == null)
1083	{
1084	return NO_QUERY_TERMS;
1085	}
1086	String service_name = (String) previous_params.get(GSParams.SERVICE);
1087	if (service_name == null \|\| !service_name.endsWith("Query"))
1088	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1089	logger.debug("invalid service "+service_name+", not doing highlighting");
1090	return NO_QUERY_TERMS;
1091	}
1092
1093	String collection = (String) params.get(GSParams.COLLECTION);
1094	UserContext userContext = new UserContext(request);
1095	String to = GSPath.appendLink(collection, service_name);
1096
1097	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1098	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1099	mr_query_message.appendChild(mr_query_request);
1100
1101	// paramList
1102	HashMap service_params = (HashMap) params.get("s1");
1103
1104	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1105	GSXML.addParametersToList(query_param_list, service_params);
1106	mr_query_request.appendChild(query_param_list);
1107
1108	// do the query
1109	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1110
1111	// find the term lists
1112	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1113	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1114	if (query_term_list_element == null)
1115	{
1116	// no term info
1117	return NO_QUERY_TERMS;
1118	}
1119
1120	int result_code = NO_EQUIV_QUERY_TERMS;
1121	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1122	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1123	{
1124	// if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1125	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1126	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1127	{
1128	for (int i = 0; i < terms_nodelist.getLength(); i++)
1129	{
1130	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1131	query_term_variants.add(termValue.toLowerCase());
1132	}
1133	}
1134	}
1135	else
1136	{
1137	result_code = EQUIV_QUERY_TERMS;
1138	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1139	{
1140	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1141	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1142	for (int j = 0; j < equivalent_terms.length; j++)
1143	{
1144	query_term_variants.add(equivalent_terms[j]);
1145	}
1146	}
1147	}
1148
1149	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1150	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1151
1152	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1153	String performed_query = GSXML.getNodeText(query_element) + " ";
1154	logger.debug("performed query="+performed_query);
1155
1156	boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1157	if (performed_query.contains("\"")) {
1158	has_phrases = true;
1159	}
1160
1161	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1162	int term_start = 0;
1163	boolean in_term = false;
1164	boolean in_phrase = false;
1165	for (int i = 0; i < performed_query.length(); i++) {
1166
1167	char character = performed_query.charAt(i);
1168	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1169
1170	// Has a query term just started?
1171	if (in_term == false && is_character_letter_or_digit == true)
1172	{
1173	in_term = true;
1174	term_start = i;
1175	}
1176
1177	// Or has a term just finished?
1178	else if (in_term == true && is_character_letter_or_digit == false)
1179	{
1180	in_term = false;
1181	String term = performed_query.substring(term_start, i);
1182	if (has_phrases) {
1183	// do the phrase bit
1184	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1185	if (result_code == EQUIV_QUERY_TERMS) {
1186	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1187	if (term_element != null) {
1188	// might be null for eg TX in [snails]:TX
1189
1190	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1191	if (term_equivalent_terms_nodelist != null \|\| term_equivalent_terms_nodelist.getLength() != 0) {
1192	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1193	{
1194	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1195	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1196	for (int k = 0; k < term_equivalent_terms.length; k++)
1197	{
1198	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1199	}
1200	}
1201	}
1202	}
1203	} else { // result_code != EQUIV_QUERY_TERMS
1204	// we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1205	if (query_term_variants.contains(term.toLowerCase()) \|\| containsSubString(query_term_variants, term)) {
1206	// this handles the case where the user has searched for snails, but term list returns 'snail'
1207	phrase_query_p_term_x_variants.add(term.toLowerCase());
1208	}
1209	}
1210	if (phrase_query_p_term_x_variants.size()>0) {
1211	// we have found a valid term
1212	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1213
1214	if (in_phrase == false)
1215	{
1216	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1217	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1218	}
1219	}
1220	} // end if has_phrases
1221	else {
1222	// no phrases so we don't have to do the phrasey stuff. but
1223	// we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1224	// we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1225	// but dont want to include eg TX
1226	if (result_code == NO_EQUIV_QUERY_TERMS) {
1227	if (containsSubString(query_term_variants, term)) {
1228	query_term_variants.add(term.toLowerCase());
1229	}
1230	}
1231
1232	}
1233	} // end of in_term...
1234	// Watch for phrases (surrounded by quotes)
1235	if (character == '\"') {
1236
1237	// Has a phrase just started?
1238	if (in_phrase == false)
1239	{
1240	in_phrase = true;
1241	}
1242	// Or has a phrase just finished?
1243	else if (in_phrase == true)
1244	{
1245	in_phrase = false;
1246	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1247	}
1248
1249	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1250	} // if char == "
1251	} // for each char in performed query
1252
1253	return result_code;
1254	}
1255
1256	protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1257	// hack to filter out TX, TI field names
1258	String lc_term = term.toLowerCase();
1259	if (query_term_variants.contains(term)) {
1260	return false; // or true??
1261	}
1262	if (term.matches("[A-Z][A-Z][A-Z]?")) {
1263	return false;
1264	}
1265	Iterator i = query_term_variants.iterator();
1266	while (i.hasNext()) {
1267	String t = (String)i.next();
1268	if (term.startsWith(t)) {
1269	return true;
1270	}
1271	}
1272	return false;
1273	}
1274
1275
1276	/** retrieve the marked up highlighted section - only works for solr collection */
1277	protected Element retrieveHighlightedContent(Element request, String node_id) {
1278
1279	Document doc = XMLConverter.newDOM();
1280
1281	// do the query again to get term info
1282	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1283	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1284
1285	HashMap previous_params = (HashMap) params.get("p");
1286	if (previous_params == null)
1287	{
1288	return null;
1289	}
1290	String service_name = (String) previous_params.get(GSParams.SERVICE);
1291	if (service_name == null \|\| !service_name.endsWith("Query"))
1292	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1293	logger.debug("HL invalid service, not doing highlighting");
1294	return null;
1295	}
1296
1297	String collection = (String) params.get(GSParams.COLLECTION);
1298	UserContext userContext = new UserContext(request);
1299	String to = GSPath.appendLink(collection, service_name);
1300
1301	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1302	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1303	mr_query_message.appendChild(mr_query_request);
1304
1305	// paramList
1306	HashMap service_params = (HashMap) params.get("s1");
1307
1308	// hack in case the user searched on eg titles, but we want highlighting in the text
1309	service_params.put("index", "TX");
1310	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1311	GSXML.addParametersToList(query_param_list, service_params);
1312
1313	if (node_id != null) {
1314	GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1315	} else {
1316	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1317	}
1318	mr_query_request.appendChild(query_param_list);
1319	// do the query
1320
1321	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1322	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1323	Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1324
1325	if (highlighted_node == null) {
1326	return null;
1327	}
1328	// For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1329	//We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1330
1331	// Build a request to process highlighted text
1332
1333	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1334	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1335	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1336	hl_message.appendChild(dc_request);
1337
1338	// Create a parameter list to specify the request parameters - empty for now
1339	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1340	dc_request.appendChild(dc_param_list);
1341
1342	// get the content
1343	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1344	dc_request.appendChild(doc_list);
1345	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1346	doc_list.appendChild(current_doc);
1347	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1348	//Append highlighted content to request for processing
1349	dc_request.appendChild(doc.importNode(highlighted_node, true));
1350	Element hl_response_message = (Element) this.mr.process(hl_message);
1351	//Get results
1352	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1353	Element content = (Element) contentList.item(0);
1354	return content;
1355
1356
1357	}
1358	/**
1359	* Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1360	*/
1361	protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1362
1363	NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1364	for (int j=named_elems.getLength()-1; j>=0; j--) {
1365	Element this_elem = (Element)named_elems.item(j);
1366	Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1367	this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1368	}
1369	return true;
1370	}
1371	/**
1372	* Highlights query terms in the text content of an element.
1373	*/
1374	private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1375	{
1376	String content = GSXML.getNodeText(original_element);
1377	// Convert the content string to an array of characters for speed
1378	char[] content_characters = new char[content.length()];
1379	content.getChars(0, content.length(), content_characters, 0);
1380
1381	// Now skim through the content, identifying word matches
1382	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1383	int word_start = 0;
1384	boolean in_word = false;
1385	boolean preceding_word_matched = false;
1386	boolean inTag = false;
1387	for (int i = 0; i < content_characters.length; i++)
1388	{
1389	//We don't want to find words inside HTML tags
1390	if (content_characters[i] == '<')
1391	{
1392	// are we currently in a word?
1393	if (in_word) {
1394	in_word = false;
1395	String word = new String(content_characters, word_start, (i - word_start));
1396	if (case_insensitive) {
1397	word = word.toLowerCase();
1398	}
1399	if (query_term_variants.contains(word)) {
1400	// We have found a matching word, so remember its location
1401	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1402	// should preceding word matched be set to true/false here??
1403	preceding_word_matched = true;
1404	} else {
1405	preceding_word_matched = false;
1406	}
1407	}
1408	inTag = true;
1409	continue;
1410	}
1411	else if (inTag && content_characters[i] == '>')
1412	{
1413	inTag = false;
1414	continue;
1415	}
1416	else if (inTag)
1417	{
1418	continue;
1419	}
1420
1421	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1422
1423	// Has a word just started?
1424	if (in_word == false && is_character_letter_or_digit == true)
1425	{
1426	in_word = true;
1427	word_start = i;
1428	}
1429
1430	// Or has a word just finished?
1431	else if (in_word == true && is_character_letter_or_digit == false)
1432	{
1433	in_word = false;
1434
1435	// Check if the word matches any of the query term equivalents
1436	String word = new String(content_characters, word_start, (i - word_start));
1437	if (case_insensitive) {
1438	word = word.toLowerCase();
1439	}
1440	if (query_term_variants.contains(word))
1441	{
1442	// We have found a matching word, so remember its location
1443	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1444	preceding_word_matched = true;
1445	}
1446	else
1447	{
1448	preceding_word_matched = false;
1449	}
1450	}
1451	}
1452
1453	// Don't forget the last word...
1454	if (in_word == true)
1455	{
1456	// Check if the word matches any of the query term equivalents
1457	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1458	if (case_insensitive) {
1459	word = word.toLowerCase();
1460	}
1461	if (query_term_variants.contains(word))
1462	{
1463	// We have found a matching word, so remember its location
1464	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1465	}
1466	}
1467
1468	if (word_matches.size() == 0) {
1469	// just return a copy of the original element
1470	return (Element)doc.importNode(original_element, true);
1471
1472	}
1473
1474	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1475	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1476
1477	if (phrase_query_term_variants_hierarchy.size() ==0) {
1478	for (int i = 0; i < word_matches.size(); i++) {
1479	highlight_start_positions.add(new Integer(word_matches.get(i).start_position));
1480	highlight_end_positions.add(new Integer(word_matches.get(i).end_position));
1481	}
1482	}
1483	else {
1484	// Deal with phrases now
1485	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1486	for (int i = 0; i < word_matches.size(); i++)
1487	{
1488	WordMatch word_match = word_matches.get(i);
1489
1490	// See if any partial phrase matches are extended by this word
1491	if (word_match.preceding_word_matched)
1492	{
1493	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1494	{
1495	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1496	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1497	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1498	if (phrase_query_p_term_x_variants.contains(word_match.word))
1499	{
1500	partial_phrase_match.num_words_matched++;
1501
1502	// Has a complete phrase match occurred?
1503	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1504	{
1505	// Check for overlaps by looking at the previous highlight range
1506	if (!highlight_end_positions.isEmpty())
1507	{
1508	int last_highlight_index = highlight_end_positions.size() - 1;
1509	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1510	if (last_highlight_end > partial_phrase_match.start_position)
1511	{
1512	// There is an overlap, so remove the previous phrase match
1513	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1514	highlight_end_positions.remove(last_highlight_index);
1515	partial_phrase_match.start_position = last_highlight_start;
1516	}
1517	}
1518
1519	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1520	highlight_end_positions.add(new Integer(word_match.end_position));
1521	}
1522	// No, but add the partial match back into the list for next time
1523	else
1524	{
1525	partial_phrase_matches.add(partial_phrase_match);
1526	}
1527	}
1528	}
1529	}
1530	else
1531	{
1532	partial_phrase_matches.clear();
1533	}
1534
1535	// See if this word is at the start of any of the phrases
1536	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1537	{
1538	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1539	if (phrase_query_p_term_variants_list.size()>0) {
1540	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1541	if (phrase_query_p_term_1_variants.contains(word_match.word))
1542	{
1543	// If this phrase is just one word long, we have a complete match
1544	if (phrase_query_p_term_variants_list.size() == 1)
1545	{
1546	highlight_start_positions.add(new Integer(word_match.start_position));
1547	highlight_end_positions.add(new Integer(word_match.end_position));
1548	}
1549	// Otherwise we have the start of a potential phrase match
1550	else
1551	{
1552	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1553	}
1554	}
1555	}
1556	}
1557	}
1558	}
1559
1560	// Now add the annotation tags into the document at the correct points
1561	Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1562	int last_wrote = 0;
1563	for (int i = 0; i < highlight_start_positions.size(); i++)
1564	{
1565	int highlight_start = highlight_start_positions.get(i).intValue();
1566	int highlight_end = highlight_end_positions.get(i).intValue();
1567
1568	// Print anything before the highlight range
1569	if (last_wrote < highlight_start)
1570	{
1571	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1572	content_element.appendChild(doc.createTextNode(preceding_text));
1573	}
1574
1575	// Print the highlight text, annotated
1576	if (highlight_end > last_wrote)
1577	{
1578	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1579	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1580	annotation_element.setAttribute("type", "query_term");
1581	content_element.appendChild(annotation_element);
1582	last_wrote = highlight_end;
1583	}
1584	}
1585
1586	// Finish off any unwritten text
1587	if (last_wrote < content_characters.length)
1588	{
1589	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1590	content_element.appendChild(doc.createTextNode(remaining_text));
1591	}
1592	return content_element;
1593	}
1594
1595
1596	static private class WordMatch
1597	{
1598	public String word;
1599	public int start_position;
1600	public int end_position;
1601	public boolean preceding_word_matched;
1602
1603	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1604	{
1605	this.word = word;
1606	this.start_position = start_position;
1607	this.end_position = end_position;
1608	this.preceding_word_matched = preceding_word_matched;
1609	}
1610	}
1611
1612	static private class PartialPhraseMatch
1613	{
1614	public int start_position;
1615	public int query_phrase_number;
1616	public int num_words_matched;
1617
1618	public PartialPhraseMatch(int start_position, int query_phrase_number)
1619	{
1620	this.start_position = start_position;
1621	this.query_phrase_number = query_phrase_number;
1622	this.num_words_matched = 1;
1623	}
1624	}
1625	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: