Context Navigation

DocumentAction.java@ 37515

Last change on this file since 37515 was 37515, checked in by kjdon, 14 months ago
Using the new getRequiredMetadataNames - it has an extra arg, and we no longer need to do the extraMetadataList bit ourselves, as it's now in getRequiredMetadataNames
Property svn:keywords set to `Author Date Id Revision`
File size: 63.6 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24	import org.greenstone.gsdl3.service.DocXMLUtil;
25	import org.greenstone.gsdl3.util.*;
26	import org.greenstone.util.GlobalProperties;
27
28	// XML classes
29	import org.w3c.dom.Document;
30	import org.w3c.dom.Element;
31	import org.w3c.dom.Node;
32	import org.w3c.dom.Text;
33	import org.w3c.dom.NodeList;
34
35	// General Java classes
36	import java.util.ArrayList;
37	import java.util.HashMap;
38	import java.util.HashSet;
39	import java.util.Iterator;
40	import java.io.File;
41	import java.io.Serializable;
42
43	import org.apache.log4j.*;
44
45	/** Action class for retrieving Documents via the message router */
46	public class DocumentAction extends Action
47	{
48
49	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
50
51	// this is used to specify that the sibling nodes of a selected one should be obtained
52	public static final String SIBLING_ARG = "sib";
53	public static final String GOTO_PAGE_ARG = "gp";
54	public static final String ENRICH_DOC_ARG = "end";
55	public static final String EXPAND_DOCUMENT_ARG = "ed";
56	public static final String EXPAND_CONTENTS_ARG = "ec";
57	public static final String REALISTIC_BOOK_ARG = "book";
58	public static final String NO_TEXT_ARG = "noText";
59	public static final String DOC_EDIT_ARG = "docEdit";
60	public static final String DOC_VERSION_ARG = "dv";
61
62	/**
63	* if this is set to true, when a document is displayed, any annotation type
64	* services (enrich) will be offered to the user as well
65	*/
66	protected boolean provide_annotations = false;
67
68	protected boolean highlight_query_terms = false;
69
70	public boolean configure()
71	{
72	super.configure();
73	String highlight = (String) config_params.get("highlightQueryTerms");
74	if (highlight != null && highlight.equals("true"))
75	{
76	highlight_query_terms = true;
77	}
78	String annotate = (String) config_params.get("displayAnnotationService");
79	if (annotate != null && annotate.equals("true"))
80	{
81	provide_annotations = true;
82	}
83	return true;
84	}
85
86	public Node process(Node message_node)
87	{
88	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
89
90	Element message = GSXML.nodeToElement(message_node);
91	Document doc = XMLConverter.newDOM();
92
93	// the response
94	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
95	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
96	result.appendChild(page_response);
97
98	// get the request - assume only one
99	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
100	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
101	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
102
103	// just in case there are some that need to get passed to the services
104	// why do we use s0 here and s1 in other places???
105	HashMap service_params = (HashMap) params.get("s0");
106
107	String collection = (String) params.get(GSParams.COLLECTION);
108	String document_id = (String) params.get(GSParams.DOCUMENT);
109	if (document_id != null && document_id.equals(""))
110	{
111	document_id = null;
112	}
113	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
114	if (href != null && href.equals(""))
115	{
116	href = null;
117	}
118	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
119	if (document_id == null && href == null)
120	{
121	logger.error("no document specified!");
122	return result;
123	}
124	if (rl != null && rl.equals("0"))
125	{
126	// this is a true external link, we should have been directed to a different page or action
127	logger.error("rl value was 0, shouldn't get here");
128	return result;
129	}
130
131	String doc_id_modifier = "";
132	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
133	if (sibling_num != null && !sibling_num.equals(""))
134	{
135	// we have to modify the doc name
136	doc_id_modifier = "." + sibling_num + ".ss";
137	}
138
139
140	UserContext userContext = new UserContext(request);
141
142	//append site metadata
143	addSiteMetadata(page_response, userContext);
144	addInterfaceOptions(page_response);
145
146	// get the additional data needed for the page
147	getBackgroundData(page_response, collection, userContext);
148
149	// create a basic doc list containing the current node
150	// we will use this to query whether the id is valid, and to get document type
151	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
152	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
153	basic_doc_list.appendChild(current_doc);
154	if (document_id != null)
155	{
156	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
157	}
158	else
159	{
160	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
161	// do we need this??
162	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
163	}
164
165	// lets do a quick check here for valid doc id.
166	if (document_id != null) {
167	boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
168	if (!is_valid) {
169	GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
170	return result;
171	}
172	}
173	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
174
175	if (format_elem != null) {
176	// lets look for param defaults set in config file
177	NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
178	for (int i=0; i<param_defaults.getLength(); i++) {
179	Element p = (Element)param_defaults.item(i);
180	String name = p.getAttribute(GSXML.NAME_ATT);
181	if (params.get(name) ==null) {
182	// wasn't set from interface
183	String value = p.getAttribute(GSXML.VALUE_ATT);
184	params.put(name, value );
185	// also add into request param xml so that xslt knows it too
186	GSXML.addParameterToList(cgi_paramList, name, value);
187	}
188	}
189	}
190
191	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
192	if (document_type != null && document_type.equals(""))
193	{
194	//document_type = "hierarchy";
195	document_type = null; // we'll get it later if not already specified
196	}
197	// what if it is null here?? Anu to check...
198
199
200	boolean editing_document = false;
201	String doc_edit = (String) params.get(DOC_EDIT_ARG);
202	if (doc_edit != null && doc_edit.equals("1")) {
203	editing_document = true;
204	}
205
206	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
207	if (editing_document) {
208	String opt_document_version = (String) params.get(DOC_VERSION_ARG);
209	return getFormattedArchiveDoc(doc, collection, document_id, opt_document_version, document_type, result, page_response, userContext);
210	}
211
212	//whether to retrieve siblings or not
213	boolean get_siblings = false;
214	String sibs = (String) params.get(SIBLING_ARG);
215	if (sibs != null && sibs.equals("1"))
216	{
217	get_siblings = true;
218	}
219
220	boolean expand_document = false;
221	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
222	if (ed_arg != null && ed_arg.equals("1"))
223	{
224	expand_document = true;
225	}
226
227	boolean expand_contents = false;
228	if (expand_document)
229	{ // we always expand the contents with the text
230	expand_contents = true;
231	}
232	else
233	{
234	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
235	if (ec_arg != null && ec_arg.equals("1"))
236	{
237	expand_contents = true;
238	}
239	}
240
241	// do we want text content? Not if no_text=1.
242	// expand_document overrides this. - should it??
243	boolean get_text = true;
244	String nt_arg = (String) params.get(NO_TEXT_ARG);
245
246	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
247	logger.debug("SETTING GET TEXT TO FALSE");
248	get_text = false;
249	} else {
250	logger.debug("GET TEXT REMAINS TRUE");
251	}
252
253	// the_document is where all the doc info - structure and metadata etc
254	// is added into, to be returned in the page
255	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
256	page_response.appendChild(the_document);
257
258	// used to create basic_doc_list here
259	if (document_type == null)
260	{
261	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
262	}
263	if (document_type == null)
264	{
265	logger.debug("##### doctype is null, setting to simple");
266	document_type = GSXML.DOC_TYPE_SIMPLE;
267	}
268
269	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
270
271	// start getting doc structure
272
273	// Create a parameter list to specify the required structure information
274	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
275
276	if (service_params != null)
277	{
278	GSXML.addParametersToList(ds_param_list, service_params);
279	}
280
281	Element ds_param = null;
282	boolean get_structure = false;
283	boolean get_structure_info = false;
284	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
285	{
286	get_structure_info = true;
287
288	if (expand_contents)
289	{
290	ds_param = doc.createElement(GSXML.PARAM_ELEM);
291	ds_param_list.appendChild(ds_param);
292	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
293	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
294	}
295
296	// get the info needed for paged naviagtion
297	ds_param = doc.createElement(GSXML.PARAM_ELEM);
298	ds_param_list.appendChild(ds_param);
299	ds_param.setAttribute(GSXML.NAME_ATT, "info");
300	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
301	ds_param = doc.createElement(GSXML.PARAM_ELEM);
302	ds_param_list.appendChild(ds_param);
303	ds_param.setAttribute(GSXML.NAME_ATT, "info");
304	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
305	ds_param = doc.createElement(GSXML.PARAM_ELEM);
306	ds_param_list.appendChild(ds_param);
307	ds_param.setAttribute(GSXML.NAME_ATT, "info");
308	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
309
310	if (get_siblings)
311	{
312	ds_param = doc.createElement(GSXML.PARAM_ELEM);
313	ds_param_list.appendChild(ds_param);
314	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
315	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
316	}
317
318	}
319	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
320	{
321	get_structure = true;
322	if (expand_contents)
323	{
324	ds_param = doc.createElement(GSXML.PARAM_ELEM);
325	ds_param_list.appendChild(ds_param);
326	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
328	}
329	else
330	{
331	// get the info needed for table of contents
332	ds_param = doc.createElement(GSXML.PARAM_ELEM);
333	ds_param_list.appendChild(ds_param);
334	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
335	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
336	ds_param = doc.createElement(GSXML.PARAM_ELEM);
337	ds_param_list.appendChild(ds_param);
338	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
340	if (get_siblings)
341	{
342	ds_param = doc.createElement(GSXML.PARAM_ELEM);
343	ds_param_list.appendChild(ds_param);
344	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
345	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
346	}
347	}
348	}
349	else
350	{
351	// we dont need any structure
352	}
353
354	boolean has_dummy = false;
355	if (get_structure \|\| get_structure_info)
356	{
357
358	// Build a request to obtain the document structure
359	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
360	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
361	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
362	ds_message.appendChild(ds_request);
363	ds_request.appendChild(ds_param_list);
364
365	// add the node list we created earlier
366	ds_request.appendChild(basic_doc_list);
367
368	// Process the document structure retrieve message
369	Element ds_response_message = (Element) this.mr.process(ds_message);
370	if (processErrorElements(ds_response_message, page_response))
371	{
372	return result;
373	}
374
375	// get the info and print out
376	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
377	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
378	path = GSPath.appendLink(path, "nodeStructureInfo");
379	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
380	// get the doc_node bit
381	if (ds_response_struct_info != null)
382	{
383	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
384	}
385	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
386	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
387	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
388	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
389
390	if (ds_response_structure != null)
391	{
392	// add the contents of the structure bit into the_document
393	NodeList structs = ds_response_structure.getChildNodes();
394	for (int i = 0; i < structs.getLength(); i++)
395	{
396	the_document.appendChild(doc.importNode(structs.item(i), true));
397	}
398	}
399	else
400	{
401	// no structure nodes, so put in a dummy doc node
402	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
403	if (document_id != null)
404	{
405	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
406	}
407	else
408	{
409	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
410
411	}
412	the_document.appendChild(doc_node);
413	has_dummy = true;
414	}
415	}
416	else
417	{ // a simple type - we dont have a dummy node for simple
418	// should think about this more
419	// no structure request, so just put in a dummy doc node
420	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
421	if (document_id != null)
422	{
423	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
424	}
425	else
426	{
427	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
428	}
429	the_document.appendChild(doc_node);
430	has_dummy = true;
431	}
432
433	// end getting doc structure
434
435	// start getting doc metadata
436
437	// Build a request to obtain some document metadata
438	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
439	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
440	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
441	dm_message.appendChild(dm_request);
442	// Create a parameter list to specify the required metadata information
443
444	HashSet<String> meta_names = new HashSet<String>();
445	meta_names.add("Title"); // the default
446	getRequiredMetadataNames(meta_names, format_elem, request);
447
448	Element dm_param_list = createMetadataParamList(doc,meta_names);
449	if (service_params != null)
450	{
451	GSXML.addParametersToList(dm_param_list, service_params);
452	}
453
454	dm_request.appendChild(dm_param_list);
455
456	// create the doc node list for the metadata request
457	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
458	dm_request.appendChild(dm_doc_list);
459
460	// Add each node from the structure response into the metadata request
461	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
462	for (int i = 0; i < doc_nodes.getLength(); i++)
463	{
464	Element doc_node = (Element) doc_nodes.item(i);
465	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
466
467	// Add the documentNode to the list
468	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
469	if (needSectionContent(params)) {
470	if (doc_node_id.equals(document_id)) {
471	dm_doc_list.appendChild(dm_doc_node);
472	}
473	} else {
474	dm_doc_list.appendChild(dm_doc_node);
475	}
476	//dm_doc_list.appendChild(dm_doc_node);
477	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
478	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
479	if (document_id == null){
480	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
481	}
482
483	}
484	// we also want a metadata request to the top level document to get
485	// assocfilepath - this could be cached too
486	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
487	dm_message.appendChild(doc_meta_request);
488	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
489	if (service_params != null)
490	{
491	GSXML.addParametersToList(doc_meta_param_list, service_params);
492	}
493
494	doc_meta_request.appendChild(doc_meta_param_list);
495	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
496	doc_meta_param_list.appendChild(doc_param);
497	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
498	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
499
500	// create the doc node list for the metadata request
501	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
502	doc_meta_request.appendChild(doc_list);
503
504	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
505	// the node we want is the root document node
506	if (document_id != null)
507	{
508	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
509	}
510	/*else
511	{
512	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
513	// can we assume that href is always a top level doc??
514	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
515	//doc_node.setAttribute("externalURL", has_rl);
516	}*/
517	doc_list.appendChild(doc_node);
518
519	Element dm_response_message = (Element) this.mr.process(dm_message);
520	if (processErrorElements(dm_response_message, page_response))
521	{
522	return result;
523	}
524
525	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
526	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
527
528	// Merge the metadata with the structure information
529	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
530	for (int i = 0; i < doc_nodes.getLength(); i++)
531	{
532	Node dcNode;
533	String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
534	if (node_idd.isEmpty()) {
535	String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
536	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
537	} else {
538	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
539	}
540	GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
541	}
542	// get the top level doc metadata out
543	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
544	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
545	GSXML.mergeMetadataLists(the_document, top_doc_node);
546
547	// if we are highlighting query terms, then we also get them highlighted in the metadata
548
549	HashSet<String> query_term_variants = null;
550	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
551	boolean do_highlight_query_terms = highlight_query_terms;
552	int query_terms_status = 0;
553	if (highlight_query_terms) {
554	// lets get the query term equivalents
555	query_term_variants = new HashSet<String>();
556	phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
557	if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
558	do_highlight_query_terms = false; // we couldn't get the terms
559	}
560	}
561
562	// lets try marking up the metadata with search terms
563	// if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
564	boolean highlight_case_insensitive = false;
565	if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
566	highlight_case_insensitive = true;
567	}
568	if (do_highlight_query_terms) {
569	highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
570	}
571
572	// do we want doc text content? If not, we are done.
573	if (!get_text) {
574	// don't get text
575	return result;
576	}
577
578	// Build a request to obtain some document content
579	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
580	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
581	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
582	dc_message.appendChild(dc_request);
583
584	// Create a parameter list to specify the request parameters - empty for now
585	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
586	if (service_params != null)
587	{
588	GSXML.addParametersToList(dc_param_list, service_params);
589	}
590
591	dc_request.appendChild(dc_param_list);
592
593	// get the content
594	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
595	if (expand_document)
596	{
597	dc_request.appendChild(dm_doc_list);
598	}
599	else
600	{
601	dc_request.appendChild(basic_doc_list);
602	}
603	Element dc_response_message = (Element) this.mr.process(dc_message);
604
605	if (processErrorElements(dc_response_message, page_response))
606	{
607	return result;
608
609	}
610	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
611
612	boolean get_marked_up_doc_from_query = false;
613	if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
614	get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
615	}
616
617	if (expand_document)
618	{
619	// Merge the content with the structure information
620	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
621	for (int i = 0; i < doc_nodes.getLength(); i++)
622	{
623	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
624	Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
625	Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
626	if (content != null)
627	{
628	if (do_highlight_query_terms) {
629	if (get_marked_up_doc_from_query) {
630
631	Element new_content = retrieveHighlightedContent(request, node_id);
632
633	if (new_content == null) {
634	// we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
635	get_marked_up_doc_from_query = false;
636	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
637	} else {
638	content= new_content;
639	}
640	} else {
641	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
642	}
643	}
644	doc_nodes.item(i).appendChild(doc.importNode(content, true));
645	}
646
647	}
648	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
649	Element dummy_node = (Element) doc_nodes.item(0);
650	the_document.removeChild(dummy_node);
651	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
652	NodeList dummy_children = dummy_node.getChildNodes();
653	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
654	{
655	// special case as we don't want more than one metadata list
656	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
657	{
658	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
659	}
660	else
661	{
662	the_document.appendChild(dummy_children.item(i));
663	}
664	}
665	}
666	}
667	else
668	{
669	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
670	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
671
672	if (dc_response_doc_content == null)
673	{
674	// no content to add
675	if (dc_response_doc.getAttribute("external").equals("true"))
676	{
677	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
678
679	the_document.setAttribute("selectedNode", href_id);
680	the_document.setAttribute("external", href_id);
681	}
682	return result;
683	}
684	if (do_highlight_query_terms)
685	{
686	dc_response_doc.removeChild(dc_response_doc_content);
687	if (get_marked_up_doc_from_query) {
688	Element new_content = retrieveHighlightedContent(request, null);
689	if (new_content == null) {
690	get_marked_up_doc_from_query = false;
691	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
692	} else {
693
694	dc_response_doc_content = new_content;
695	}
696	} else {
697	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
698	}
699	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
700	}
701
702	if (provide_annotations)
703	{
704	String service_selected = (String) params.get(ENRICH_DOC_ARG);
705	if (service_selected != null && service_selected.equals("1"))
706	{
707	// now we can modifiy the response doc if needed
708	String enrich_service = (String) params.get(GSParams.SERVICE);
709	// send a message to the service
710	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
711	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
712	enrich_message.appendChild(enrich_request);
713	// check for parameters
714	HashMap e_service_params = (HashMap) params.get("s1");
715	if (e_service_params != null)
716	{
717	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
718	GSXML.addParametersToList(enrich_pl, e_service_params);
719	enrich_request.appendChild(enrich_pl);
720	}
721	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
722	enrich_request.appendChild(e_doc_list);
723	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
724
725	Node enrich_response = this.mr.process(enrich_message);
726
727	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
728	path = GSPath.createPath(links);
729	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
730
731	}
732	} // if provide_annotations
733
734	// use the returned id rather than the sent one cos there may have
735	// been modifiers such as .pr that are removed.
736	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
737	the_document.setAttribute("selectedNode", modified_doc_id);
738	if (has_dummy)
739	{
740	// change the id if necessary and add the content
741	Element dummy_node = (Element) doc_nodes.item(0);
742
743	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
744	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
745	// hack for simple type
746	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
747	{
748	// we dont want the internal docNode, just want the content and metadata in the document
749	// rethink this!!
750	the_document.removeChild(dummy_node);
751
752	NodeList dummy_children = dummy_node.getChildNodes();
753	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
754	{
755	// special case as we don't want more than one metadata list
756	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
757	{
758	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
759	}
760	else
761	{
762	the_document.appendChild(dummy_children.item(i));
763	}
764	}
765	}
766
767	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
768	}
769	else
770	{
771	// Merge the document content with the metadata and structure information
772	for (int i = 0; i < doc_nodes.getLength(); i++)
773	{
774	Node dn = doc_nodes.item(i);
775	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
776	if (dn_id.equals(modified_doc_id))
777	{
778	dn.appendChild(doc.importNode(dc_response_doc_content, true));
779	break;
780	}
781	}
782	}
783	}
784	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
785	return result;
786	}
787
788	protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
789	Document doc = basic_doc_list.getOwnerDocument();
790
791	Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
792	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
793	Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
794	v_message.appendChild(v_request);
795
796	// add the node list
797	v_request.appendChild(basic_doc_list);
798	Element v_response_message = (Element) this.mr.process(v_message);
799	if (processErrorElements(v_response_message, page_response))
800	{
801	return false;
802	}
803	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
804	String path = GSPath.createPath(links);
805	Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
806	if (info_elem == null) {
807	return false;
808	}
809	if (info_elem.getAttribute("valid").equals("true")) {
810	return true;
811	}
812	return false;
813
814	}
815
816	protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String opt_document_version, String document_type,
817	Element result, Element page_response, UserContext userContext ) {
818	// call get archive doc
819	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
820	String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
821	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
822	dx_message.appendChild(dx_request);
823	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
824	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
825	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
826	dx_section.setAttribute(GSXML.DOC_VERSION_ATT, opt_document_version);
827	dx_request.appendChild(dx_section);
828
829	Element dx_response_message = (Element) this.mr.process(dx_message);
830	if (processErrorElements(dx_response_message, page_response))
831	{
832	return result;
833	}
834
835	// get the section out
836	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
837	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
838	if (section == null) {
839	logger.error("no archive doc returned for "+document_id);
840	return result;
841	}
842	// convert the archive format into the internal format that the page response requires
843
844	// work out doctype
845	// NOTE: this will be coming from collection database in index
846	// the archive file doesn't store this. So we have to assume
847	// that the doc type will not be changing with any
848	// modifications happening to archives.
849
850	// if doc type is null, then we need to work it out.
851	// create a basic doc list containing the current node
852
853	if (document_type == null) {
854	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
855	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
856	basic_doc_list.appendChild(current_doc);
857	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
858	basic_doc_list.appendChild(current_doc);
859	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
860	}
861
862	if (document_type == null) {
863	logger.debug("@@@ doctype is null, setting to simple");
864	document_type = GSXML.DOC_TYPE_SIMPLE;
865	}
866
867	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
868	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
869	page_response.appendChild(doc_elem);
870
871	Element transformed_section = transformArchiveToDocument(section);
872	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
873	// simple doc, only returning a single document node, which is the top level section.
874	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
875	GSXML.mergeElements(doc_elem, transformed_section);
876	return result;
877	}
878
879	// multi sectioned document.
880	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
881	// In docEdit mode, we obtain the text from archives, from doc.xml
882	// Now the transformation has replaced <Section> with <documentNode>
883	// Need to add nodeID, nodeType and docType attributes to each docNode
884	// as doc.xml doesn't store that.
885	insertDocNodeAttributes(transformed_section, document_type, null);
886	doc_elem.appendChild(doc.importNode(transformed_section, true));
887	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
888
889	return result;
890	}
891
892
893	private boolean needSectionContent(HashMap<String, Serializable> params) {
894	String document_id = (String) params.get(GSParams.DOCUMENT);
895	String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
896	String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
897	if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
898	return true;
899	}
900
901	return false;
902	}
903	/**
904	* this method gets the collection description, the format info, the list of
905	* enrich services, etc - stuff that is needed for the page, but is the same
906	* whatever the query is - should be cached
907	*/
908	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
909	{
910	Document doc = page_response.getOwnerDocument();
911
912	// create a message to process - contains requests for the collection
913	// description, the format element, the enrich services on offer
914	// these could all be cached
915	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
916	String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
917	// the format request - ignore for now, where does this request go to??
918	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
919	info_message.appendChild(format_request);
920
921	// the enrich_services request - only do this if provide_annotations is true
922
923	if (provide_annotations)
924	{
925	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
926	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
927	info_message.appendChild(enrich_services_request);
928	}
929
930	Element info_response = (Element) this.mr.process(info_message);
931
932	// the collection is the first response
933	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
934	Element format_resp = (Element) responses.item(0);
935
936	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
937	if (format_elem != null)
938	{
939	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
940	if (global_format_elem != null)
941	{
942	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
943	}
944
945	// set the format type
946	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
947	page_response.appendChild(doc.importNode(format_elem, true));
948	}
949
950	if (provide_annotations)
951	{
952	Element services_resp = (Element) responses.item(1);
953
954	// a new message for the mr
955	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
956	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
957	boolean service_found = false;
958	for (int j = 0; j < e_services.getLength(); j++)
959	{
960	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
961	{
962	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
963	enrich_message.appendChild(s);
964	service_found = true;
965	}
966	}
967	if (service_found)
968	{
969	Element enrich_response = (Element) this.mr.process(enrich_message);
970
971	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
972	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
973	for (int i = 0; i < e_responses.getLength(); i++)
974	{
975	Element e_resp = (Element) e_responses.item(i);
976	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
977	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
978	service_list.appendChild(e_service);
979	}
980	page_response.appendChild(service_list);
981	}
982	} // if provide_annotations
983	return true;
984
985	}
986
987	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
988	{
989	Document doc = basic_doc_list.getOwnerDocument();
990
991	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
992	String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
993	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
994	ds_message.appendChild(ds_request);
995
996	// Create a parameter list to specify the required structure information
997	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
998	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
999	ds_param_list.appendChild(ds_param);
1000	ds_param.setAttribute(GSXML.NAME_ATT, "info");
1001	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1002
1003	ds_request.appendChild(ds_param_list);
1004
1005	// add the node list we created earlier
1006	ds_request.appendChild(basic_doc_list);
1007
1008	// Process the document structure retrieve message
1009	Element ds_response_message = (Element) this.mr.process(ds_message);
1010	if (processErrorElements(ds_response_message, page_response))
1011	{
1012	return null;
1013	}
1014
1015	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1016	String path = GSPath.createPath(links);
1017	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1018	if (info_elem == null) {
1019	return null;
1020	}
1021	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1022	if (doctype_elem != null)
1023	{
1024	String doc_type = doctype_elem.getAttribute("value");
1025	return doc_type;
1026	}
1027	return null;
1028	}
1029
1030	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1031	// The docType remains constant as in parameter document_type
1032	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1033	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1034	// The nodeType is root if rootNode, internal if there are children and leaf if no children
1035	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1036
1037	boolean isRoot = false;
1038	if(id == null) { // rootNode, get the root nodeID to work with recursively
1039	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1040	isRoot = true;
1041	} else { // for all but the root node, need to still set the nodeID
1042	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1043	}
1044
1045	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1046
1047	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1048	if(docNodes.getLength() > 0) {
1049	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1050	for(int i = 0; i < docNodes.getLength(); i++) {
1051	Element childDocNode = (Element)docNodes.item(i);
1052
1053	// work out the child docNode's nodeID based on current id
1054	String nodeID = id + "." + (i+1);
1055	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1056	}
1057	} else {
1058	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1059	}
1060
1061	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1062	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1063
1064	}
1065
1066	/** run the XSLT transform which converts from doc.xml format to our internal document format */
1067	protected Element transformArchiveToDocument(Element section) {
1068
1069	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1070	if (stylesheet_filename == null) {
1071	logger.error("Couldn't find stylesheet archive2document.xsl");
1072	return section;
1073	}
1074
1075	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1076	if (stylesheet_doc == null) {
1077	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1078	return section;
1079	}
1080
1081	Document section_doc = XMLConverter.newDOM();
1082	section_doc.appendChild(section_doc.importNode(section, true));
1083	Node result = this.transformer.transform(stylesheet_doc, section_doc);
1084	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1085
1086	Element new_element;
1087	if (result.getNodeType() == Node.DOCUMENT_NODE) {
1088	new_element = ((Document) result).getDocumentElement();
1089	} else {
1090	new_element = (Element) result;
1091	}
1092
1093
1094	return new_element;
1095
1096	}
1097
// Result codes returned by getQueryTermVariants.
/** No term information is available: no previous query params, the previous service was not a query, or the re-run query returned no term list. */
protected final int NO_QUERY_TERMS = 0;
/** A term list was returned, but it contains no equivTermList elements. */
protected final int NO_EQUIV_QUERY_TERMS = 1;
/** A term list was returned and it contains equivTermList elements. */
protected final int EQUIV_QUERY_TERMS = 2;
1101	/**
1102	* this involves a bit of a hack to get the equivalent query terms - has to
1103	* requery the query service - uses the last selected service name. (if it
1104	* ends in query).
1105	*/
1106	protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1107	{
1108	Document doc = XMLConverter.newDOM();
1109
1110	// do the query again to get term info
1111	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1112	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1113
1114	HashMap previous_params = (HashMap) params.get("p");
1115	if (previous_params == null)
1116	{
1117	return NO_QUERY_TERMS;
1118	}
1119	String service_name = (String) previous_params.get(GSParams.SERVICE);
1120	if (service_name == null \|\| !service_name.endsWith("Query"))
1121	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1122	logger.debug("invalid service "+service_name+", not doing highlighting");
1123	return NO_QUERY_TERMS;
1124	}
1125
1126	String collection = (String) params.get(GSParams.COLLECTION);
1127	UserContext userContext = new UserContext(request);
1128	String to = GSPath.appendLink(collection, service_name);
1129
1130	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1131	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1132	mr_query_message.appendChild(mr_query_request);
1133
1134	// paramList
1135	HashMap service_params = (HashMap) params.get("s1");
1136
1137	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1138	GSXML.addParametersToList(query_param_list, service_params);
1139	mr_query_request.appendChild(query_param_list);
1140
1141	// do the query
1142	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1143
1144	// find the term lists
1145	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1146	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1147	if (query_term_list_element == null)
1148	{
1149	// no term info
1150	return NO_QUERY_TERMS;
1151	}
1152
1153	int result_code = NO_EQUIV_QUERY_TERMS;
1154	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1155	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1156	{
1157	// if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1158	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1159	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1160	{
1161	for (int i = 0; i < terms_nodelist.getLength(); i++)
1162	{
1163	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1164	query_term_variants.add(termValue.toLowerCase());
1165	}
1166	}
1167	}
1168	else
1169	{
1170	result_code = EQUIV_QUERY_TERMS;
1171	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1172	{
1173	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1174	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1175	for (int j = 0; j < equivalent_terms.length; j++)
1176	{
1177	query_term_variants.add(equivalent_terms[j]);
1178	}
1179	}
1180	}
1181
1182	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1183	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1184
1185	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1186	String performed_query = GSXML.getNodeText(query_element) + " ";
1187	logger.debug("performed query="+performed_query);
1188
1189	boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1190	if (performed_query.contains("\"")) {
1191	has_phrases = true;
1192	}
1193
1194	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1195	int term_start = 0;
1196	boolean in_term = false;
1197	boolean in_phrase = false;
1198	for (int i = 0; i < performed_query.length(); i++) {
1199
1200	char character = performed_query.charAt(i);
1201	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1202
1203	// Has a query term just started?
1204	if (in_term == false && is_character_letter_or_digit == true)
1205	{
1206	in_term = true;
1207	term_start = i;
1208	}
1209
1210	// Or has a term just finished?
1211	else if (in_term == true && is_character_letter_or_digit == false)
1212	{
1213	in_term = false;
1214	String term = performed_query.substring(term_start, i);
1215	if (has_phrases) {
1216	// do the phrase bit
1217	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1218	if (result_code == EQUIV_QUERY_TERMS) {
1219	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1220	if (term_element != null) {
1221	// might be null for eg TX in [snails]:TX
1222
1223	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1224	if (term_equivalent_terms_nodelist != null \|\| term_equivalent_terms_nodelist.getLength() != 0) {
1225	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1226	{
1227	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1228	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1229	for (int k = 0; k < term_equivalent_terms.length; k++)
1230	{
1231	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1232	}
1233	}
1234	}
1235	}
1236	} else { // result_code != EQUIV_QUERY_TERMS
1237	// we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1238	if (query_term_variants.contains(term.toLowerCase()) \|\| containsSubString(query_term_variants, term)) {
1239	// this handles the case where the user has searched for snails, but term list returns 'snail'
1240	phrase_query_p_term_x_variants.add(term.toLowerCase());
1241	}
1242	}
1243	if (phrase_query_p_term_x_variants.size()>0) {
1244	// we have found a valid term
1245	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1246
1247	if (in_phrase == false)
1248	{
1249	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1250	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1251	}
1252	}
1253	} // end if has_phrases
1254	else {
1255	// no phrases so we don't have to do the phrasey stuff. but
1256	// we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1257	// we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1258	// but dont want to include eg TX
1259	if (result_code == NO_EQUIV_QUERY_TERMS) {
1260	if (containsSubString(query_term_variants, term)) {
1261	query_term_variants.add(term.toLowerCase());
1262	}
1263	}
1264
1265	}
1266	} // end of in_term...
1267	// Watch for phrases (surrounded by quotes)
1268	if (character == '\"') {
1269
1270	// Has a phrase just started?
1271	if (in_phrase == false)
1272	{
1273	in_phrase = true;
1274	}
1275	// Or has a phrase just finished?
1276	else if (in_phrase == true)
1277	{
1278	in_phrase = false;
1279	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1280	}
1281
1282	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1283	} // if char == "
1284	} // for each char in performed query
1285
1286	return result_code;
1287	}
1288
1289	protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1290	// hack to filter out TX, TI field names
1291	String lc_term = term.toLowerCase();
1292	if (query_term_variants.contains(term)) {
1293	return false; // or true??
1294	}
1295	if (term.matches("[A-Z][A-Z][A-Z]?")) {
1296	return false;
1297	}
1298	Iterator i = query_term_variants.iterator();
1299	while (i.hasNext()) {
1300	String t = (String)i.next();
1301	if (term.startsWith(t)) {
1302	return true;
1303	}
1304	}
1305	return false;
1306	}
1307
1308
1309	/** retrieve the marked up highlighted section - only works for solr collection */
1310	protected Element retrieveHighlightedContent(Element request, String node_id) {
1311
1312	Document doc = XMLConverter.newDOM();
1313
1314	// do the query again to get term info
1315	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1316	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1317
1318	HashMap previous_params = (HashMap) params.get("p");
1319	if (previous_params == null)
1320	{
1321	return null;
1322	}
1323	String service_name = (String) previous_params.get(GSParams.SERVICE);
1324	if (service_name == null \|\| !service_name.endsWith("Query"))
1325	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1326	logger.debug("HL invalid service, not doing highlighting");
1327	return null;
1328	}
1329
1330	String collection = (String) params.get(GSParams.COLLECTION);
1331	UserContext userContext = new UserContext(request);
1332	String to = GSPath.appendLink(collection, service_name);
1333
1334	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1335	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1336	mr_query_message.appendChild(mr_query_request);
1337
1338	// paramList
1339	HashMap service_params = (HashMap) params.get("s1");
1340
1341	// hack in case the user searched on eg titles, but we want highlighting in the text
1342	service_params.put("index", "TX");
1343	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1344	GSXML.addParametersToList(query_param_list, service_params);
1345
1346	if (node_id != null) {
1347	GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1348	} else {
1349	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1350	}
1351	mr_query_request.appendChild(query_param_list);
1352	// do the query
1353
1354	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1355	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1356	Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1357
1358	if (highlighted_node == null) {
1359	return null;
1360	}
1361	// For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1362	//We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1363
1364	// Build a request to process highlighted text
1365
1366	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1367	to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1368	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1369	hl_message.appendChild(dc_request);
1370
1371	// Create a parameter list to specify the request parameters - empty for now
1372	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1373	dc_request.appendChild(dc_param_list);
1374
1375	// get the content
1376	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1377	dc_request.appendChild(doc_list);
1378	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1379	doc_list.appendChild(current_doc);
1380	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1381	//Append highlighted content to request for processing
1382	dc_request.appendChild(doc.importNode(highlighted_node, true));
1383	Element hl_response_message = (Element) this.mr.process(hl_message);
1384	//Get results
1385	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1386	Element content = (Element) contentList.item(0);
1387	return content;
1388
1389
1390	}
1391	/**
1392	* Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1393	*/
1394	protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1395
1396	NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1397	for (int j=named_elems.getLength()-1; j>=0; j--) {
1398	Element this_elem = (Element)named_elems.item(j);
1399	Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1400	this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1401	}
1402	return true;
1403	}
1404	/**
1405	* Highlights query terms in the text content of an element.
1406	*/
1407	private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1408	{
1409	String content = GSXML.getNodeText(original_element);
1410	// Convert the content string to an array of characters for speed
1411	char[] content_characters = new char[content.length()];
1412	content.getChars(0, content.length(), content_characters, 0);
1413
1414	// Now skim through the content, identifying word matches
1415	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1416	int word_start = 0;
1417	boolean in_word = false;
1418	boolean preceding_word_matched = false;
1419	boolean inTag = false;
1420	for (int i = 0; i < content_characters.length; i++)
1421	{
1422	//We don't want to find words inside HTML tags
1423	if (content_characters[i] == '<')
1424	{
1425	// are we currently in a word?
1426	if (in_word) {
1427	in_word = false;
1428	String word = new String(content_characters, word_start, (i - word_start));
1429	if (case_insensitive) {
1430	word = word.toLowerCase();
1431	}
1432	if (query_term_variants.contains(word)) {
1433	// We have found a matching word, so remember its location
1434	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1435	// should preceding word matched be set to true/false here??
1436	preceding_word_matched = true;
1437	} else {
1438	preceding_word_matched = false;
1439	}
1440	}
1441	inTag = true;
1442	continue;
1443	}
1444	else if (inTag && content_characters[i] == '>')
1445	{
1446	inTag = false;
1447	continue;
1448	}
1449	else if (inTag)
1450	{
1451	continue;
1452	}
1453
1454	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1455
1456	// Has a word just started?
1457	if (in_word == false && is_character_letter_or_digit == true)
1458	{
1459	in_word = true;
1460	word_start = i;
1461	}
1462
1463	// Or has a word just finished?
1464	else if (in_word == true && is_character_letter_or_digit == false)
1465	{
1466	in_word = false;
1467
1468	// Check if the word matches any of the query term equivalents
1469	String word = new String(content_characters, word_start, (i - word_start));
1470	if (case_insensitive) {
1471	word = word.toLowerCase();
1472	}
1473	if (query_term_variants.contains(word))
1474	{
1475	// We have found a matching word, so remember its location
1476	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1477	preceding_word_matched = true;
1478	}
1479	else
1480	{
1481	preceding_word_matched = false;
1482	}
1483	}
1484	}
1485
1486	// Don't forget the last word...
1487	if (in_word == true)
1488	{
1489	// Check if the word matches any of the query term equivalents
1490	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1491	if (case_insensitive) {
1492	word = word.toLowerCase();
1493	}
1494	if (query_term_variants.contains(word))
1495	{
1496	// We have found a matching word, so remember its location
1497	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1498	}
1499	}
1500
1501	if (word_matches.size() == 0) {
1502	// just return a copy of the original element
1503	return (Element)doc.importNode(original_element, true);
1504
1505	}
1506
1507	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1508	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1509
1510	if (phrase_query_term_variants_hierarchy.size() ==0) {
1511	for (int i = 0; i < word_matches.size(); i++) {
1512	highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1513	highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1514	}
1515	}
1516	else {
1517	// Deal with phrases now
1518	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1519	for (int i = 0; i < word_matches.size(); i++)
1520	{
1521	WordMatch word_match = word_matches.get(i);
1522
1523	// See if any partial phrase matches are extended by this word
1524	if (word_match.preceding_word_matched)
1525	{
1526	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1527	{
1528	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1529	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1530	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1531	if (phrase_query_p_term_x_variants.contains(word_match.word))
1532	{
1533	partial_phrase_match.num_words_matched++;
1534
1535	// Has a complete phrase match occurred?
1536	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1537	{
1538	// Check for overlaps by looking at the previous highlight range
1539	if (!highlight_end_positions.isEmpty())
1540	{
1541	int last_highlight_index = highlight_end_positions.size() - 1;
1542	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1543	if (last_highlight_end > partial_phrase_match.start_position)
1544	{
1545	// There is an overlap, so remove the previous phrase match
1546	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1547	highlight_end_positions.remove(last_highlight_index);
1548	partial_phrase_match.start_position = last_highlight_start;
1549	}
1550	}
1551
1552	highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1553	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1554	}
1555	// No, but add the partial match back into the list for next time
1556	else
1557	{
1558	partial_phrase_matches.add(partial_phrase_match);
1559	}
1560	}
1561	}
1562	}
1563	else
1564	{
1565	partial_phrase_matches.clear();
1566	}
1567
1568	// See if this word is at the start of any of the phrases
1569	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1570	{
1571	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1572	if (phrase_query_p_term_variants_list.size()>0) {
1573	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1574	if (phrase_query_p_term_1_variants.contains(word_match.word))
1575	{
1576	// If this phrase is just one word long, we have a complete match
1577	if (phrase_query_p_term_variants_list.size() == 1)
1578	{
1579	highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1580	highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1581	}
1582	// Otherwise we have the start of a potential phrase match
1583	else
1584	{
1585	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1586	}
1587	}
1588	}
1589	}
1590	}
1591	}
1592
1593	// Now add the annotation tags into the document at the correct points
1594	Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1595	int last_wrote = 0;
1596	for (int i = 0; i < highlight_start_positions.size(); i++)
1597	{
1598	int highlight_start = highlight_start_positions.get(i).intValue();
1599	int highlight_end = highlight_end_positions.get(i).intValue();
1600
1601	// Print anything before the highlight range
1602	if (last_wrote < highlight_start)
1603	{
1604	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1605	content_element.appendChild(doc.createTextNode(preceding_text));
1606	}
1607
1608	// Print the highlight text, annotated
1609	if (highlight_end > last_wrote)
1610	{
1611	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1612	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1613	annotation_element.setAttribute("type", "query_term");
1614	content_element.appendChild(annotation_element);
1615	last_wrote = highlight_end;
1616	}
1617	}
1618
1619	// Finish off any unwritten text
1620	if (last_wrote < content_characters.length)
1621	{
1622	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1623	content_element.appendChild(doc.createTextNode(remaining_text));
1624	}
1625	return content_element;
1626	}
1627
1628
1629	static private class WordMatch
1630	{
1631	public String word;
1632	public int start_position;
1633	public int end_position;
1634	public boolean preceding_word_matched;
1635
1636	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1637	{
1638	this.word = word;
1639	this.start_position = start_position;
1640	this.end_position = end_position;
1641	this.preceding_word_matched = preceding_word_matched;
1642	}
1643	}
1644
1645	static private class PartialPhraseMatch
1646	{
1647	public int start_position;
1648	public int query_phrase_number;
1649	public int num_words_matched;
1650
1651	public PartialPhraseMatch(int start_position, int query_phrase_number)
1652	{
1653	this.start_position = start_position;
1654	this.query_phrase_number = query_phrase_number;
1655	this.num_words_matched = 1;
1656	}
1657	}
1658	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 37515

Download in other formats: