Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 36978

Last change on this file since 36978 was 36978, checked in by kjdon, 17 months ago
Now this does a check to see if the doc id is valid before proceeding to get the structure, metadata etc. If the id is invalid, the page will contain an error element and not a document element. Also replaced hard-coded service names with their variable names.
Property svn:keywords set to `Author Date Id Revision`
File size: 63.8 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24	import org.greenstone.gsdl3.service.DocXMLUtil;
25	import org.greenstone.gsdl3.util.*;
26	import org.greenstone.util.GlobalProperties;
27
28	// XML classes
29	import org.w3c.dom.Document;
30	import org.w3c.dom.Element;
31	import org.w3c.dom.Node;
32	import org.w3c.dom.Text;
33	import org.w3c.dom.NodeList;
34
35	// General Java classes
36	import java.util.ArrayList;
37	import java.util.HashMap;
38	import java.util.HashSet;
39	import java.util.Iterator;
40	import java.io.File;
41	import java.io.Serializable;
42
43	import org.apache.log4j.*;
44
45	/** Action class for retrieving Documents via the message router */
46	public class DocumentAction extends Action
47	{
48
49	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
50
51	// this is used to specify that the sibling nodes of a selected one should be obtained
52	public static final String SIBLING_ARG = "sib";
53	public static final String GOTO_PAGE_ARG = "gp";
54	public static final String ENRICH_DOC_ARG = "end";
55	public static final String EXPAND_DOCUMENT_ARG = "ed";
56	public static final String EXPAND_CONTENTS_ARG = "ec";
57	public static final String REALISTIC_BOOK_ARG = "book";
58	public static final String NO_TEXT_ARG = "noText";
59	public static final String DOC_EDIT_ARG = "docEdit";
60
61	/**
62	* if this is set to true, when a document is displayed, any annotation type
63	* services (enrich) will be offered to the user as well
64	*/
65	protected boolean provide_annotations = false;
66
67	protected boolean highlight_query_terms = false;
68
69	public boolean configure()
70	{
71	super.configure();
72	String highlight = (String) config_params.get("highlightQueryTerms");
73	if (highlight != null && highlight.equals("true"))
74	{
75	highlight_query_terms = true;
76	}
77	String annotate = (String) config_params.get("displayAnnotationService");
78	if (annotate != null && annotate.equals("true"))
79	{
80	provide_annotations = true;
81	}
82	return true;
83	}
84
85	public Node process(Node message_node)
86	{
87	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
88
89	Element message = GSXML.nodeToElement(message_node);
90	Document doc = XMLConverter.newDOM();
91
92	// the response
93	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
94	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
95	result.appendChild(page_response);
96
97	// get the request - assume only one
98	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
99	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
100	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
101
102	// just in case there are some that need to get passed to the services
103	// why do we use s0 here and s1 in other places???
104	HashMap service_params = (HashMap) params.get("s0");
105
106	String collection = (String) params.get(GSParams.COLLECTION);
107	String document_id = (String) params.get(GSParams.DOCUMENT);
108	if (document_id != null && document_id.equals(""))
109	{
110	document_id = null;
111	}
112	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
113	if (href != null && href.equals(""))
114	{
115	href = null;
116	}
117	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
118	if (document_id == null && href == null)
119	{
120	logger.error("no document specified!");
121	return result;
122	}
123	if (rl != null && rl.equals("0"))
124	{
125	// this is a true external link, we should have been directed to a different page or action
126	logger.error("rl value was 0, shouldn't get here");
127	return result;
128	}
129
130	String doc_id_modifier = "";
131	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
132	if (sibling_num != null && !sibling_num.equals(""))
133	{
134	// we have to modify the doc name
135	doc_id_modifier = "." + sibling_num + ".ss";
136	}
137
138
139	UserContext userContext = new UserContext(request);
140
141	//append site metadata
142	addSiteMetadata(page_response, userContext);
143	addInterfaceOptions(page_response);
144
145	// get the additional data needed for the page
146	getBackgroundData(page_response, collection, userContext);
147
148	// create a basic doc list containing the current node
149	// we will use this to query whether the id is valid, and to get document type
150	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
151	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
152	basic_doc_list.appendChild(current_doc);
153	if (document_id != null)
154	{
155	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
156	}
157	else
158	{
159	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
160	// do we need this??
161	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
162	}
163
164	// lets do a quick check here for valid doc id.
165	if (document_id != null) {
166	boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
167	if (!is_valid) {
168	GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
169	return result;
170	}
171	}
172	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
173
174	if (format_elem != null) {
175	// lets look for param defaults set in config file
176	NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
177	for (int i=0; i<param_defaults.getLength(); i++) {
178	Element p = (Element)param_defaults.item(i);
179	String name = p.getAttribute(GSXML.NAME_ATT);
180	if (params.get(name) ==null) {
181	// wasn't set from interface
182	String value = p.getAttribute(GSXML.VALUE_ATT);
183	params.put(name, value );
184	// also add into request param xml so that xslt knows it too
185	GSXML.addParameterToList(cgi_paramList, name, value);
186	}
187	}
188	}
189
190	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
191	if (document_type != null && document_type.equals(""))
192	{
193	//document_type = "hierarchy";
194	document_type = null; // we'll get it later if not already specified
195	}
196	// what if it is null here?? Anu to check...
197
198
199	boolean editing_document = false;
200	String doc_edit = (String) params.get(DOC_EDIT_ARG);
201	if (doc_edit != null && doc_edit.equals("1")) {
202	editing_document = true;
203	}
204
205	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
206	if (editing_document) {
207	return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
208	}
209
210	//whether to retrieve siblings or not
211	boolean get_siblings = false;
212	String sibs = (String) params.get(SIBLING_ARG);
213	if (sibs != null && sibs.equals("1"))
214	{
215	get_siblings = true;
216	}
217
218	boolean expand_document = false;
219	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
220	if (ed_arg != null && ed_arg.equals("1"))
221	{
222	expand_document = true;
223	}
224
225	boolean expand_contents = false;
226	if (expand_document)
227	{ // we always expand the contents with the text
228	expand_contents = true;
229	}
230	else
231	{
232	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
233	if (ec_arg != null && ec_arg.equals("1"))
234	{
235	expand_contents = true;
236	}
237	}
238
239	// do we want text content? Not if no_text=1.
240	// expand_document overrides this. - should it??
241	boolean get_text = true;
242	String nt_arg = (String) params.get(NO_TEXT_ARG);
243
244	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
245	logger.debug("SETTING GET TEXT TO FALSE");
246	get_text = false;
247	} else {
248	logger.debug("GET TEXT REMAINS TRUE");
249	}
250
251	// the_document is where all the doc info - structure and metadata etc
252	// is added into, to be returned in the page
253	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
254	page_response.appendChild(the_document);
255
256	// used to create basic_doc_list here
257	if (document_type == null)
258	{
259	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
260	}
261	if (document_type == null)
262	{
263	logger.debug("##### doctype is null, setting to simple");
264	document_type = GSXML.DOC_TYPE_SIMPLE;
265	}
266
267	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
268
269	// start getting doc structure
270
271	// Create a parameter list to specify the required structure information
272	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
273
274	if (service_params != null)
275	{
276	GSXML.addParametersToList(ds_param_list, service_params);
277	}
278
279	Element ds_param = null;
280	boolean get_structure = false;
281	boolean get_structure_info = false;
282	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
283	{
284	get_structure_info = true;
285
286	if (expand_contents)
287	{
288	ds_param = doc.createElement(GSXML.PARAM_ELEM);
289	ds_param_list.appendChild(ds_param);
290	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
291	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
292	}
293
294	// get the info needed for paged naviagtion
295	ds_param = doc.createElement(GSXML.PARAM_ELEM);
296	ds_param_list.appendChild(ds_param);
297	ds_param.setAttribute(GSXML.NAME_ATT, "info");
298	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
299	ds_param = doc.createElement(GSXML.PARAM_ELEM);
300	ds_param_list.appendChild(ds_param);
301	ds_param.setAttribute(GSXML.NAME_ATT, "info");
302	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
303	ds_param = doc.createElement(GSXML.PARAM_ELEM);
304	ds_param_list.appendChild(ds_param);
305	ds_param.setAttribute(GSXML.NAME_ATT, "info");
306	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
307
308	if (get_siblings)
309	{
310	ds_param = doc.createElement(GSXML.PARAM_ELEM);
311	ds_param_list.appendChild(ds_param);
312	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
313	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
314	}
315
316	}
317	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
318	{
319	get_structure = true;
320	if (expand_contents)
321	{
322	ds_param = doc.createElement(GSXML.PARAM_ELEM);
323	ds_param_list.appendChild(ds_param);
324	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
325	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
326	}
327	else
328	{
329	// get the info needed for table of contents
330	ds_param = doc.createElement(GSXML.PARAM_ELEM);
331	ds_param_list.appendChild(ds_param);
332	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
333	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
334	ds_param = doc.createElement(GSXML.PARAM_ELEM);
335	ds_param_list.appendChild(ds_param);
336	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
337	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
338	if (get_siblings)
339	{
340	ds_param = doc.createElement(GSXML.PARAM_ELEM);
341	ds_param_list.appendChild(ds_param);
342	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
343	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
344	}
345	}
346	}
347	else
348	{
349	// we dont need any structure
350	}
351
352	boolean has_dummy = false;
353	if (get_structure \|\| get_structure_info)
354	{
355
356	// Build a request to obtain the document structure
357	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
358	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
359	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
360	ds_message.appendChild(ds_request);
361	ds_request.appendChild(ds_param_list);
362
363	// add the node list we created earlier
364	ds_request.appendChild(basic_doc_list);
365
366	// Process the document structure retrieve message
367	Element ds_response_message = (Element) this.mr.process(ds_message);
368	if (processErrorElements(ds_response_message, page_response))
369	{
370	return result;
371	}
372
373	// get the info and print out
374	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
375	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
376	path = GSPath.appendLink(path, "nodeStructureInfo");
377	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
378	// get the doc_node bit
379	if (ds_response_struct_info != null)
380	{
381	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
382	}
383	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
384	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
385	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
386	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
387
388	if (ds_response_structure != null)
389	{
390	// add the contents of the structure bit into the_document
391	NodeList structs = ds_response_structure.getChildNodes();
392	for (int i = 0; i < structs.getLength(); i++)
393	{
394	the_document.appendChild(doc.importNode(structs.item(i), true));
395	}
396	}
397	else
398	{
399	// no structure nodes, so put in a dummy doc node
400	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
401	if (document_id != null)
402	{
403	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
404	}
405	else
406	{
407	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
408
409	}
410	the_document.appendChild(doc_node);
411	has_dummy = true;
412	}
413	}
414	else
415	{ // a simple type - we dont have a dummy node for simple
416	// should think about this more
417	// no structure request, so just put in a dummy doc node
418	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
419	if (document_id != null)
420	{
421	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
422	}
423	else
424	{
425	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
426	}
427	the_document.appendChild(doc_node);
428	has_dummy = true;
429	}
430
431	// end getting doc structure
432
433	// start getting doc metadata
434
435	// Build a request to obtain some document metadata
436	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
437	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
438	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
439	dm_message.appendChild(dm_request);
440	// Create a parameter list to specify the required metadata information
441
442	HashSet<String> meta_names = new HashSet<String>();
443	meta_names.add("Title"); // the default
444	if (format_elem != null)
445	{
446	getRequiredMetadataNames(format_elem, meta_names);
447	}
448
449	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
450	if (extraMetaListElem != null)
451	{
452	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
453	for (int i = 0; i < extraMetaList.getLength(); i++)
454	{
455	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
456	}
457	}
458
459	Element dm_param_list = createMetadataParamList(doc,meta_names);
460	if (service_params != null)
461	{
462	GSXML.addParametersToList(dm_param_list, service_params);
463	}
464
465	dm_request.appendChild(dm_param_list);
466
467	// create the doc node list for the metadata request
468	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
469	dm_request.appendChild(dm_doc_list);
470
471	// Add each node from the structure response into the metadata request
472	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
473	for (int i = 0; i < doc_nodes.getLength(); i++)
474	{
475	Element doc_node = (Element) doc_nodes.item(i);
476	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
477
478	// Add the documentNode to the list
479	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
480	if (needSectionContent(params)) {
481	if (doc_node_id.equals(document_id)) {
482	dm_doc_list.appendChild(dm_doc_node);
483	}
484	} else {
485	dm_doc_list.appendChild(dm_doc_node);
486	}
487	//dm_doc_list.appendChild(dm_doc_node);
488	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
489	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
490	if (document_id == null){
491	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
492	}
493
494	}
495	// we also want a metadata request to the top level document to get
496	// assocfilepath - this could be cached too
497	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
498	dm_message.appendChild(doc_meta_request);
499	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
500	if (service_params != null)
501	{
502	GSXML.addParametersToList(doc_meta_param_list, service_params);
503	}
504
505	doc_meta_request.appendChild(doc_meta_param_list);
506	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
507	doc_meta_param_list.appendChild(doc_param);
508	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
509	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
510
511	// create the doc node list for the metadata request
512	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
513	doc_meta_request.appendChild(doc_list);
514
515	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
516	// the node we want is the root document node
517	if (document_id != null)
518	{
519	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
520	}
521	/*else
522	{
523	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
524	// can we assume that href is always a top level doc??
525	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
526	//doc_node.setAttribute("externalURL", has_rl);
527	}*/
528	doc_list.appendChild(doc_node);
529
530	Element dm_response_message = (Element) this.mr.process(dm_message);
531	if (processErrorElements(dm_response_message, page_response))
532	{
533	return result;
534	}
535
536	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
537	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
538
539	// Merge the metadata with the structure information
540	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
541	for (int i = 0; i < doc_nodes.getLength(); i++)
542	{
543	Node dcNode;
544	String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
545	if (node_idd.isEmpty()) {
546	String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
547	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
548	} else {
549	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
550	}
551	GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
552	}
553	// get the top level doc metadata out
554	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
555	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
556	GSXML.mergeMetadataLists(the_document, top_doc_node);
557
558	// if we are highlighting query terms, then we also get them highlighted in the metadata
559
560	HashSet<String> query_term_variants = null;
561	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
562	boolean do_highlight_query_terms = highlight_query_terms;
563	int query_terms_status = 0;
564	if (highlight_query_terms) {
565	// lets get the query term equivalents
566	query_term_variants = new HashSet<String>();
567	phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
568	if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
569	do_highlight_query_terms = false; // we couldn't get the terms
570	}
571	}
572
573	// lets try marking up the metadata with search terms
574	// if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
575	boolean highlight_case_insensitive = false;
576	if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
577	highlight_case_insensitive = true;
578	}
579	if (do_highlight_query_terms) {
580	highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
581	}
582
583	// do we want doc text content? If not, we are done.
584	if (!get_text) {
585	// don't get text
586	return result;
587	}
588
589	// Build a request to obtain some document content
590	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
591	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
592	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
593	dc_message.appendChild(dc_request);
594
595	// Create a parameter list to specify the request parameters - empty for now
596	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
597	if (service_params != null)
598	{
599	GSXML.addParametersToList(dc_param_list, service_params);
600	}
601
602	dc_request.appendChild(dc_param_list);
603
604	// get the content
605	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
606	if (expand_document)
607	{
608	dc_request.appendChild(dm_doc_list);
609	}
610	else
611	{
612	dc_request.appendChild(basic_doc_list);
613	}
614	Element dc_response_message = (Element) this.mr.process(dc_message);
615
616	if (processErrorElements(dc_response_message, page_response))
617	{
618	return result;
619
620	}
621	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
622
623	boolean get_marked_up_doc_from_query = false;
624	if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
625	get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
626	}
627
628	if (expand_document)
629	{
630	// Merge the content with the structure information
631	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
632	for (int i = 0; i < doc_nodes.getLength(); i++)
633	{
634	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
635	Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
636	Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
637	if (content != null)
638	{
639	if (do_highlight_query_terms) {
640	if (get_marked_up_doc_from_query) {
641
642	Element new_content = retrieveHighlightedContent(request, node_id);
643
644	if (new_content == null) {
645	// we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
646	get_marked_up_doc_from_query = false;
647	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
648	} else {
649	content= new_content;
650	}
651	} else {
652	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
653	}
654	}
655	doc_nodes.item(i).appendChild(doc.importNode(content, true));
656	}
657
658	}
659	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
660	Element dummy_node = (Element) doc_nodes.item(0);
661	the_document.removeChild(dummy_node);
662	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
663	NodeList dummy_children = dummy_node.getChildNodes();
664	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
665	{
666	// special case as we don't want more than one metadata list
667	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
668	{
669	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
670	}
671	else
672	{
673	the_document.appendChild(dummy_children.item(i));
674	}
675	}
676	}
677	}
678	else
679	{
680	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
681	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
682
683	if (dc_response_doc_content == null)
684	{
685	// no content to add
686	if (dc_response_doc.getAttribute("external").equals("true"))
687	{
688	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
689
690	the_document.setAttribute("selectedNode", href_id);
691	the_document.setAttribute("external", href_id);
692	}
693	return result;
694	}
695	if (do_highlight_query_terms)
696	{
697	dc_response_doc.removeChild(dc_response_doc_content);
698	if (get_marked_up_doc_from_query) {
699	Element new_content = retrieveHighlightedContent(request, null);
700	if (new_content == null) {
701	get_marked_up_doc_from_query = false;
702	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
703	} else {
704
705	dc_response_doc_content = new_content;
706	}
707	} else {
708	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
709	}
710	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
711	}
712
713	if (provide_annotations)
714	{
715	String service_selected = (String) params.get(ENRICH_DOC_ARG);
716	if (service_selected != null && service_selected.equals("1"))
717	{
718	// now we can modifiy the response doc if needed
719	String enrich_service = (String) params.get(GSParams.SERVICE);
720	// send a message to the service
721	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
722	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
723	enrich_message.appendChild(enrich_request);
724	// check for parameters
725	HashMap e_service_params = (HashMap) params.get("s1");
726	if (e_service_params != null)
727	{
728	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
729	GSXML.addParametersToList(enrich_pl, e_service_params);
730	enrich_request.appendChild(enrich_pl);
731	}
732	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
733	enrich_request.appendChild(e_doc_list);
734	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
735
736	Node enrich_response = this.mr.process(enrich_message);
737
738	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
739	path = GSPath.createPath(links);
740	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
741
742	}
743	} // if provide_annotations
744
745	// use the returned id rather than the sent one cos there may have
746	// been modifiers such as .pr that are removed.
747	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
748	the_document.setAttribute("selectedNode", modified_doc_id);
749	if (has_dummy)
750	{
751	// change the id if necessary and add the content
752	Element dummy_node = (Element) doc_nodes.item(0);
753
754	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
755	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
756	// hack for simple type
757	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
758	{
759	// we dont want the internal docNode, just want the content and metadata in the document
760	// rethink this!!
761	the_document.removeChild(dummy_node);
762
763	NodeList dummy_children = dummy_node.getChildNodes();
764	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
765	{
766	// special case as we don't want more than one metadata list
767	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
768	{
769	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
770	}
771	else
772	{
773	the_document.appendChild(dummy_children.item(i));
774	}
775	}
776	}
777
778	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
779	}
780	else
781	{
782	// Merge the document content with the metadata and structure information
783	for (int i = 0; i < doc_nodes.getLength(); i++)
784	{
785	Node dn = doc_nodes.item(i);
786	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
787	if (dn_id.equals(modified_doc_id))
788	{
789	dn.appendChild(doc.importNode(dc_response_doc_content, true));
790	break;
791	}
792	}
793	}
794	}
795	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
796	return result;
797	}
798
799	protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
800	Document doc = basic_doc_list.getOwnerDocument();
801
802	Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
803	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
804	Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
805	v_message.appendChild(v_request);
806
807	// add the node list
808	v_request.appendChild(basic_doc_list);
809	Element v_response_message = (Element) this.mr.process(v_message);
810	if (processErrorElements(v_response_message, page_response))
811	{
812	return false;
813	}
814	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
815	String path = GSPath.createPath(links);
816	Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
817	if (info_elem == null) {
818	return false;
819	}
820	if (info_elem.getAttribute("valid").equals("true")) {
821	return true;
822	}
823	return false;
824
825	}
826
827	protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
828	// call get archive doc
829	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
830	String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
831	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
832	dx_message.appendChild(dx_request);
833	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
834	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
835	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
836	dx_request.appendChild(dx_section);
837
838	Element dx_response_message = (Element) this.mr.process(dx_message);
839	if (processErrorElements(dx_response_message, page_response))
840	{
841	return result;
842	}
843
844	// get the section out
845	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
846	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
847	if (section == null) {
848	logger.error("no archive doc returned for "+document_id);
849	return result;
850	}
851	// convert the archive format into the internal format that the page response requires
852
853	// work out doctype
854	// NOTE: this will be coming from collection database in index
855	// the archive file doesn't store this. So we have to assume
856	// that the doc type will not be changing with any
857	// modifications happening to archives.
858
859	// if doc type is null, then we need to work it out.
860	// create a basic doc list containing the current node
861
862	if (document_type == null) {
863	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
864	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
865	basic_doc_list.appendChild(current_doc);
866	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
867	basic_doc_list.appendChild(current_doc);
868	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
869	}
870
871	if (document_type == null) {
872	logger.debug("@@@ doctype is null, setting to simple");
873	document_type = GSXML.DOC_TYPE_SIMPLE;
874	}
875
876	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
877	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
878	page_response.appendChild(doc_elem);
879
880	Element transformed_section = transformArchiveToDocument(section);
881	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
882	// simple doc, only returning a single document node, which is the top level section.
883	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
884	GSXML.mergeElements(doc_elem, transformed_section);
885	return result;
886	}
887
888	// multi sectioned document.
889	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
890	// In docEdit mode, we obtain the text from archives, from doc.xml
891	// Now the transformation has replaced <Section> with <documentNode>
892	// Need to add nodeID, nodeType and docType attributes to each docNode
893	// as doc.xml doesn't store that.
894	insertDocNodeAttributes(transformed_section, document_type, null);
895	doc_elem.appendChild(doc.importNode(transformed_section, true));
896	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
897
898	return result;
899	}
900
901
902	private boolean needSectionContent(HashMap<String, Serializable> params) {
903	String document_id = (String) params.get(GSParams.DOCUMENT);
904	String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
905	String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
906	if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
907	return true;
908	}
909
910	return false;
911	}
912	/**
913	* this method gets the collection description, the format info, the list of
914	* enrich services, etc - stuff that is needed for the page, but is the same
915	* whatever the query is - should be cached
916	*/
917	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
918	{
919	Document doc = page_response.getOwnerDocument();
920
921	// create a message to process - contains requests for the collection
922	// description, the format element, the enrich services on offer
923	// these could all be cached
924	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
925	String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
926	// the format request - ignore for now, where does this request go to??
927	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
928	info_message.appendChild(format_request);
929
930	// the enrich_services request - only do this if provide_annotations is true
931
932	if (provide_annotations)
933	{
934	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
935	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
936	info_message.appendChild(enrich_services_request);
937	}
938
939	Element info_response = (Element) this.mr.process(info_message);
940
941	// the collection is the first response
942	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
943	Element format_resp = (Element) responses.item(0);
944
945	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
946	if (format_elem != null)
947	{
948	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
949	if (global_format_elem != null)
950	{
951	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
952	}
953
954	// set the format type
955	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
956	page_response.appendChild(doc.importNode(format_elem, true));
957	}
958
959	if (provide_annotations)
960	{
961	Element services_resp = (Element) responses.item(1);
962
963	// a new message for the mr
964	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
965	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
966	boolean service_found = false;
967	for (int j = 0; j < e_services.getLength(); j++)
968	{
969	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
970	{
971	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
972	enrich_message.appendChild(s);
973	service_found = true;
974	}
975	}
976	if (service_found)
977	{
978	Element enrich_response = (Element) this.mr.process(enrich_message);
979
980	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
981	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
982	for (int i = 0; i < e_responses.getLength(); i++)
983	{
984	Element e_resp = (Element) e_responses.item(i);
985	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
986	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
987	service_list.appendChild(e_service);
988	}
989	page_response.appendChild(service_list);
990	}
991	} // if provide_annotations
992	return true;
993
994	}
995
996	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
997	{
998	Document doc = basic_doc_list.getOwnerDocument();
999
1000	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
1001	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
1002	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1003	ds_message.appendChild(ds_request);
1004
1005	// Create a parameter list to specify the required structure information
1006	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1007	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
1008	ds_param_list.appendChild(ds_param);
1009	ds_param.setAttribute(GSXML.NAME_ATT, "info");
1010	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1011
1012	ds_request.appendChild(ds_param_list);
1013
1014	// add the node list we created earlier
1015	ds_request.appendChild(basic_doc_list);
1016
1017	// Process the document structure retrieve message
1018	Element ds_response_message = (Element) this.mr.process(ds_message);
1019	if (processErrorElements(ds_response_message, page_response))
1020	{
1021	return null;
1022	}
1023
1024	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1025	String path = GSPath.createPath(links);
1026	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1027	if (info_elem == null) {
1028	return null;
1029	}
1030	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1031	if (doctype_elem != null)
1032	{
1033	String doc_type = doctype_elem.getAttribute("value");
1034	return doc_type;
1035	}
1036	return null;
1037	}
1038
1039	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1040	// The docType remains constant as in parameter document_type
1041	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1042	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1043	// The nodeType is root if rootNode, internal if there are children and leaf if no children
1044	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1045
1046	boolean isRoot = false;
1047	if(id == null) { // rootNode, get the root nodeID to work with recursively
1048	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1049	isRoot = true;
1050	} else { // for all but the root node, need to still set the nodeID
1051	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1052	}
1053
1054	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1055
1056	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1057	if(docNodes.getLength() > 0) {
1058	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1059	for(int i = 0; i < docNodes.getLength(); i++) {
1060	Element childDocNode = (Element)docNodes.item(i);
1061
1062	// work out the child docNode's nodeID based on current id
1063	String nodeID = id + "." + (i+1);
1064	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1065	}
1066	} else {
1067	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1068	}
1069
1070	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1071	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1072
1073	}
1074
1075	/** run the XSLT transform which converts from doc.xml format to our internal document format */
1076	protected Element transformArchiveToDocument(Element section) {
1077
1078	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1079	if (stylesheet_filename == null) {
1080	logger.error("Couldn't find stylesheet archive2document.xsl");
1081	return section;
1082	}
1083
1084	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1085	if (stylesheet_doc == null) {
1086	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1087	return section;
1088	}
1089
1090	Document section_doc = XMLConverter.newDOM();
1091	section_doc.appendChild(section_doc.importNode(section, true));
1092	Node result = this.transformer.transform(stylesheet_doc, section_doc);
1093	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1094
1095	Element new_element;
1096	if (result.getNodeType() == Node.DOCUMENT_NODE) {
1097	new_element = ((Document) result).getDocumentElement();
1098	} else {
1099	new_element = (Element) result;
1100	}
1101
1102
1103	return new_element;
1104
1105	}
1106
1107	protected final int NO_QUERY_TERMS = 0;
1108	protected final int NO_EQUIV_QUERY_TERMS = 1;
1109	protected final int EQUIV_QUERY_TERMS = 2;
1110	/**
1111	* this involves a bit of a hack to get the equivalent query terms - has to
1112	* requery the query service - uses the last selected service name. (if it
1113	* ends in query).
1114	*/
1115	protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1116	{
1117	Document doc = XMLConverter.newDOM();
1118
1119	// do the query again to get term info
1120	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1121	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1122
1123	HashMap previous_params = (HashMap) params.get("p");
1124	if (previous_params == null)
1125	{
1126	return NO_QUERY_TERMS;
1127	}
1128	String service_name = (String) previous_params.get(GSParams.SERVICE);
1129	if (service_name == null \|\| !service_name.endsWith("Query"))
1130	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1131	logger.debug("invalid service "+service_name+", not doing highlighting");
1132	return NO_QUERY_TERMS;
1133	}
1134
1135	String collection = (String) params.get(GSParams.COLLECTION);
1136	UserContext userContext = new UserContext(request);
1137	String to = GSPath.appendLink(collection, service_name);
1138
1139	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1140	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1141	mr_query_message.appendChild(mr_query_request);
1142
1143	// paramList
1144	HashMap service_params = (HashMap) params.get("s1");
1145
1146	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1147	GSXML.addParametersToList(query_param_list, service_params);
1148	mr_query_request.appendChild(query_param_list);
1149
1150	// do the query
1151	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1152
1153	// find the term lists
1154	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1155	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1156	if (query_term_list_element == null)
1157	{
1158	// no term info
1159	return NO_QUERY_TERMS;
1160	}
1161
1162	int result_code = NO_EQUIV_QUERY_TERMS;
1163	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1164	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1165	{
1166	// if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1167	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1168	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1169	{
1170	for (int i = 0; i < terms_nodelist.getLength(); i++)
1171	{
1172	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1173	query_term_variants.add(termValue.toLowerCase());
1174	}
1175	}
1176	}
1177	else
1178	{
1179	result_code = EQUIV_QUERY_TERMS;
1180	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1181	{
1182	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1183	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1184	for (int j = 0; j < equivalent_terms.length; j++)
1185	{
1186	query_term_variants.add(equivalent_terms[j]);
1187	}
1188	}
1189	}
1190
1191	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1192	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1193
1194	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1195	String performed_query = GSXML.getNodeText(query_element) + " ";
1196	logger.debug("performed query="+performed_query);
1197
1198	boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1199	if (performed_query.contains("\"")) {
1200	has_phrases = true;
1201	}
1202
1203	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1204	int term_start = 0;
1205	boolean in_term = false;
1206	boolean in_phrase = false;
1207	for (int i = 0; i < performed_query.length(); i++) {
1208
1209	char character = performed_query.charAt(i);
1210	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1211
1212	// Has a query term just started?
1213	if (in_term == false && is_character_letter_or_digit == true)
1214	{
1215	in_term = true;
1216	term_start = i;
1217	}
1218
1219	// Or has a term just finished?
1220	else if (in_term == true && is_character_letter_or_digit == false)
1221	{
1222	in_term = false;
1223	String term = performed_query.substring(term_start, i);
1224	if (has_phrases) {
1225	// do the phrase bit
1226	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1227	if (result_code == EQUIV_QUERY_TERMS) {
1228	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1229	if (term_element != null) {
1230	// might be null for eg TX in [snails]:TX
1231
1232	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1233	if (term_equivalent_terms_nodelist != null \|\| term_equivalent_terms_nodelist.getLength() != 0) {
1234	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1235	{
1236	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1237	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1238	for (int k = 0; k < term_equivalent_terms.length; k++)
1239	{
1240	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1241	}
1242	}
1243	}
1244	}
1245	} else { // result_code != EQUIV_QUERY_TERMS
1246	// we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1247	if (query_term_variants.contains(term.toLowerCase()) \|\| containsSubString(query_term_variants, term)) {
1248	// this handles the case where the user has searched for snails, but term list returns 'snail'
1249	phrase_query_p_term_x_variants.add(term.toLowerCase());
1250	}
1251	}
1252	if (phrase_query_p_term_x_variants.size()>0) {
1253	// we have found a valid term
1254	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1255
1256	if (in_phrase == false)
1257	{
1258	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1259	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1260	}
1261	}
1262	} // end if has_phrases
1263	else {
1264	// no phrases so we don't have to do the phrasey stuff. but
1265	// we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1266	// we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1267	// but dont want to include eg TX
1268	if (result_code == NO_EQUIV_QUERY_TERMS) {
1269	if (containsSubString(query_term_variants, term)) {
1270	query_term_variants.add(term.toLowerCase());
1271	}
1272	}
1273
1274	}
1275	} // end of in_term...
1276	// Watch for phrases (surrounded by quotes)
1277	if (character == '\"') {
1278
1279	// Has a phrase just started?
1280	if (in_phrase == false)
1281	{
1282	in_phrase = true;
1283	}
1284	// Or has a phrase just finished?
1285	else if (in_phrase == true)
1286	{
1287	in_phrase = false;
1288	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1289	}
1290
1291	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1292	} // if char == "
1293	} // for each char in performed query
1294
1295	return result_code;
1296	}
1297
1298	protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1299	// hack to filter out TX, TI field names
1300	String lc_term = term.toLowerCase();
1301	if (query_term_variants.contains(term)) {
1302	return false; // or true??
1303	}
1304	if (term.matches("[A-Z][A-Z][A-Z]?")) {
1305	return false;
1306	}
1307	Iterator i = query_term_variants.iterator();
1308	while (i.hasNext()) {
1309	String t = (String)i.next();
1310	if (term.startsWith(t)) {
1311	return true;
1312	}
1313	}
1314	return false;
1315	}
1316
1317
1318	/** retrieve the marked up highlighted section - only works for solr collection */
1319	protected Element retrieveHighlightedContent(Element request, String node_id) {
1320
1321	Document doc = XMLConverter.newDOM();
1322
1323	// do the query again to get term info
1324	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1325	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1326
1327	HashMap previous_params = (HashMap) params.get("p");
1328	if (previous_params == null)
1329	{
1330	return null;
1331	}
1332	String service_name = (String) previous_params.get(GSParams.SERVICE);
1333	if (service_name == null \|\| !service_name.endsWith("Query"))
1334	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1335	logger.debug("HL invalid service, not doing highlighting");
1336	return null;
1337	}
1338
1339	String collection = (String) params.get(GSParams.COLLECTION);
1340	UserContext userContext = new UserContext(request);
1341	String to = GSPath.appendLink(collection, service_name);
1342
1343	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1344	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1345	mr_query_message.appendChild(mr_query_request);
1346
1347	// paramList
1348	HashMap service_params = (HashMap) params.get("s1");
1349
1350	// hack in case the user searched on eg titles, but we want highlighting in the text
1351	service_params.put("index", "TX");
1352	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1353	GSXML.addParametersToList(query_param_list, service_params);
1354
1355	if (node_id != null) {
1356	GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1357	} else {
1358	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1359	}
1360	mr_query_request.appendChild(query_param_list);
1361	// do the query
1362
1363	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1364	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1365	Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1366
1367	if (highlighted_node == null) {
1368	return null;
1369	}
1370	// For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1371	//We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1372
1373	// Build a request to process highlighted text
1374
1375	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1376	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1377	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1378	hl_message.appendChild(dc_request);
1379
1380	// Create a parameter list to specify the request parameters - empty for now
1381	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1382	dc_request.appendChild(dc_param_list);
1383
1384	// get the content
1385	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1386	dc_request.appendChild(doc_list);
1387	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1388	doc_list.appendChild(current_doc);
1389	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1390	//Append highlighted content to request for processing
1391	dc_request.appendChild(doc.importNode(highlighted_node, true));
1392	Element hl_response_message = (Element) this.mr.process(hl_message);
1393	//Get results
1394	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1395	Element content = (Element) contentList.item(0);
1396	return content;
1397
1398
1399	}
1400	/**
1401	* Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1402	*/
1403	protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1404
1405	NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1406	for (int j=named_elems.getLength()-1; j>=0; j--) {
1407	Element this_elem = (Element)named_elems.item(j);
1408	Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1409	this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1410	}
1411	return true;
1412	}
1413	/**
1414	* Highlights query terms in the text content of an element.
1415	*/
1416	private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1417	{
1418	String content = GSXML.getNodeText(original_element);
1419	// Convert the content string to an array of characters for speed
1420	char[] content_characters = new char[content.length()];
1421	content.getChars(0, content.length(), content_characters, 0);
1422
1423	// Now skim through the content, identifying word matches
1424	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1425	int word_start = 0;
1426	boolean in_word = false;
1427	boolean preceding_word_matched = false;
1428	boolean inTag = false;
1429	for (int i = 0; i < content_characters.length; i++)
1430	{
1431	//We don't want to find words inside HTML tags
1432	if (content_characters[i] == '<')
1433	{
1434	// are we currently in a word?
1435	if (in_word) {
1436	in_word = false;
1437	String word = new String(content_characters, word_start, (i - word_start));
1438	if (case_insensitive) {
1439	word = word.toLowerCase();
1440	}
1441	if (query_term_variants.contains(word)) {
1442	// We have found a matching word, so remember its location
1443	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1444	// should preceding word matched be set to true/false here??
1445	preceding_word_matched = true;
1446	} else {
1447	preceding_word_matched = false;
1448	}
1449	}
1450	inTag = true;
1451	continue;
1452	}
1453	else if (inTag && content_characters[i] == '>')
1454	{
1455	inTag = false;
1456	continue;
1457	}
1458	else if (inTag)
1459	{
1460	continue;
1461	}
1462
1463	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1464
1465	// Has a word just started?
1466	if (in_word == false && is_character_letter_or_digit == true)
1467	{
1468	in_word = true;
1469	word_start = i;
1470	}
1471
1472	// Or has a word just finished?
1473	else if (in_word == true && is_character_letter_or_digit == false)
1474	{
1475	in_word = false;
1476
1477	// Check if the word matches any of the query term equivalents
1478	String word = new String(content_characters, word_start, (i - word_start));
1479	if (case_insensitive) {
1480	word = word.toLowerCase();
1481	}
1482	if (query_term_variants.contains(word))
1483	{
1484	// We have found a matching word, so remember its location
1485	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1486	preceding_word_matched = true;
1487	}
1488	else
1489	{
1490	preceding_word_matched = false;
1491	}
1492	}
1493	}
1494
1495	// Don't forget the last word...
1496	if (in_word == true)
1497	{
1498	// Check if the word matches any of the query term equivalents
1499	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1500	if (case_insensitive) {
1501	word = word.toLowerCase();
1502	}
1503	if (query_term_variants.contains(word))
1504	{
1505	// We have found a matching word, so remember its location
1506	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1507	}
1508	}
1509
1510	if (word_matches.size() == 0) {
1511	// just return a copy of the original element
1512	return (Element)doc.importNode(original_element, true);
1513
1514	}
1515
1516	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1517	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1518
1519	if (phrase_query_term_variants_hierarchy.size() ==0) {
1520	for (int i = 0; i < word_matches.size(); i++) {
1521	highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1522	highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1523	}
1524	}
1525	else {
1526	// Deal with phrases now
1527	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1528	for (int i = 0; i < word_matches.size(); i++)
1529	{
1530	WordMatch word_match = word_matches.get(i);
1531
1532	// See if any partial phrase matches are extended by this word
1533	if (word_match.preceding_word_matched)
1534	{
1535	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1536	{
1537	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1538	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1539	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1540	if (phrase_query_p_term_x_variants.contains(word_match.word))
1541	{
1542	partial_phrase_match.num_words_matched++;
1543
1544	// Has a complete phrase match occurred?
1545	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1546	{
1547	// Check for overlaps by looking at the previous highlight range
1548	if (!highlight_end_positions.isEmpty())
1549	{
1550	int last_highlight_index = highlight_end_positions.size() - 1;
1551	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1552	if (last_highlight_end > partial_phrase_match.start_position)
1553	{
1554	// There is an overlap, so remove the previous phrase match
1555	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1556	highlight_end_positions.remove(last_highlight_index);
1557	partial_phrase_match.start_position = last_highlight_start;
1558	}
1559	}
1560
1561	highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1562	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1563	}
1564	// No, but add the partial match back into the list for next time
1565	else
1566	{
1567	partial_phrase_matches.add(partial_phrase_match);
1568	}
1569	}
1570	}
1571	}
1572	else
1573	{
1574	partial_phrase_matches.clear();
1575	}
1576
1577	// See if this word is at the start of any of the phrases
1578	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1579	{
1580	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1581	if (phrase_query_p_term_variants_list.size()>0) {
1582	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1583	if (phrase_query_p_term_1_variants.contains(word_match.word))
1584	{
1585	// If this phrase is just one word long, we have a complete match
1586	if (phrase_query_p_term_variants_list.size() == 1)
1587	{
1588	highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1589	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1590	}
1591	// Otherwise we have the start of a potential phrase match
1592	else
1593	{
1594	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1595	}
1596	}
1597	}
1598	}
1599	}
1600	}
1601
1602	// Now add the annotation tags into the document at the correct points
1603	Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1604	int last_wrote = 0;
1605	for (int i = 0; i < highlight_start_positions.size(); i++)
1606	{
1607	int highlight_start = highlight_start_positions.get(i).intValue();
1608	int highlight_end = highlight_end_positions.get(i).intValue();
1609
1610	// Print anything before the highlight range
1611	if (last_wrote < highlight_start)
1612	{
1613	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1614	content_element.appendChild(doc.createTextNode(preceding_text));
1615	}
1616
1617	// Print the highlight text, annotated
1618	if (highlight_end > last_wrote)
1619	{
1620	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1621	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1622	annotation_element.setAttribute("type", "query_term");
1623	content_element.appendChild(annotation_element);
1624	last_wrote = highlight_end;
1625	}
1626	}
1627
1628	// Finish off any unwritten text
1629	if (last_wrote < content_characters.length)
1630	{
1631	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1632	content_element.appendChild(doc.createTextNode(remaining_text));
1633	}
1634	return content_element;
1635	}
1636
1637
1638	static private class WordMatch
1639	{
1640	public String word;
1641	public int start_position;
1642	public int end_position;
1643	public boolean preceding_word_matched;
1644
1645	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1646	{
1647	this.word = word;
1648	this.start_position = start_position;
1649	this.end_position = end_position;
1650	this.preceding_word_matched = preceding_word_matched;
1651	}
1652	}
1653
1654	static private class PartialPhraseMatch
1655	{
1656	public int start_position;
1657	public int query_phrase_number;
1658	public int num_words_matched;
1659
1660	public PartialPhraseMatch(int start_position, int query_phrase_number)
1661	{
1662	this.start_position = start_position;
1663	this.query_phrase_number = query_phrase_number;
1664	this.num_words_matched = 1;
1665	}
1666	}
1667	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: