Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32505

Last change on this file since 32505 was 32505, checked in by kjdon, 6 years ago
Moved some code into a new method, getFormattedArchiveDoc. Modified the search term highlighting code: getting the query term variants is now separated from marking up the text, so redoing the query is only called once. The text mark-up step can now also be called on metadata - useful if the document page displays a table of metadata and we want to highlight search terms in the table.
Property svn:keywords set to `Author Date Id Revision`
File size: 70.9 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24	import org.greenstone.util.GlobalProperties;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Text;
31	import org.w3c.dom.NodeList;
32
33	// General Java classes
34	import java.util.ArrayList;
35	import java.util.HashMap;
36	import java.util.HashSet;
37	import java.io.File;
38	import java.io.Serializable;
39
40	import org.apache.log4j.*;
41
42	/** Action class for retrieving Documents via the message router */
43	public class DocumentAction extends Action
44	{
45
46	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48	// this is used to specify that the sibling nodes of a selected one should be obtained
49	public static final String SIBLING_ARG = "sib";
50	public static final String GOTO_PAGE_ARG = "gp";
51	public static final String ENRICH_DOC_ARG = "end";
52	public static final String EXPAND_DOCUMENT_ARG = "ed";
53	public static final String EXPAND_CONTENTS_ARG = "ec";
54	public static final String REALISTIC_BOOK_ARG = "book";
55	public static final String NO_TEXT_ARG = "noText";
56	public static final String DOC_EDIT_ARG = "docEdit";
57
58	/**
59	* if this is set to true, when a document is displayed, any annotation type
60	* services (enrich) will be offered to the user as well
61	*/
62	protected boolean provide_annotations = false;
63
64	protected boolean highlight_query_terms = false;
65
66	public boolean configure()
67	{
68	super.configure();
69	String highlight = (String) config_params.get("highlightQueryTerms");
70	if (highlight != null && highlight.equals("true"))
71	{
72	highlight_query_terms = true;
73	}
74	String annotate = (String) config_params.get("displayAnnotationService");
75	if (annotate != null && annotate.equals("true"))
76	{
77	provide_annotations = true;
78	}
79	return true;
80	}
81
82	public Node process(Node message_node)
83	{
84	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86	Element message = GSXML.nodeToElement(message_node);
87	Document doc = XMLConverter.newDOM();
88
89	// the response
90	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92	result.appendChild(page_response);
93
94	// get the request - assume only one
95	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99	// just in case there are some that need to get passed to the services
100	// why do we use s0 here and s1 in other places???
101	HashMap service_params = (HashMap) params.get("s0");
102
103	String collection = (String) params.get(GSParams.COLLECTION);
104	String document_id = (String) params.get(GSParams.DOCUMENT);
105	if (document_id != null && document_id.equals(""))
106	{
107	document_id = null;
108	}
109	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
110	if (href != null && href.equals(""))
111	{
112	href = null;
113	}
114	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
115	if (document_id == null && href == null)
116	{
117	logger.error("no document specified!");
118	return result;
119	}
120	if (rl != null && rl.equals("0"))
121	{
122	// this is a true external link, we should have been directed to a different page or action
123	logger.error("rl value was 0, shouldn't get here");
124	return result;
125	}
126
127	UserContext userContext = new UserContext(request);
128
129	//append site metadata
130	addSiteMetadata(page_response, userContext);
131	addInterfaceOptions(page_response);
132
133	// get the additional data needed for the page
134	getBackgroundData(page_response, collection, userContext);
135	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
136
137	if (format_elem != null) {
138	// lets look for param defaults set in config file
139	NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
140	for (int i=0; i<param_defaults.getLength(); i++) {
141	Element p = (Element)param_defaults.item(i);
142	String name = p.getAttribute(GSXML.NAME_ATT);
143	if (params.get(name) ==null) {
144	// wasn't set from interface
145	String value = p.getAttribute(GSXML.VALUE_ATT);
146	params.put(name, value );
147	// also add into request param xml so that xslt knows it too
148	GSXML.addParameterToList(cgi_paramList, name, value);
149	}
150	}
151	}
152
153	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
154	if (document_type != null && document_type.equals(""))
155	{
156	//document_type = "hierarchy";
157	document_type = null; // we'll get it later if not already specified
158	}
159	// what if it is null here?? Anu to check...
160
161
162	boolean editing_document = false;
163	String doc_edit = (String) params.get(DOC_EDIT_ARG);
164	if (doc_edit != null && doc_edit.equals("1")) {
165	editing_document = true;
166	}
167
168	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
169	if (editing_document) {
170	return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
171	}
172
173	//whether to retrieve siblings or not
174	boolean get_siblings = false;
175	String sibs = (String) params.get(SIBLING_ARG);
176	if (sibs != null && sibs.equals("1"))
177	{
178	get_siblings = true;
179	}
180
181	String doc_id_modifier = "";
182	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
183	if (sibling_num != null && !sibling_num.equals(""))
184	{
185	// we have to modify the doc name
186	doc_id_modifier = "." + sibling_num + ".ss";
187	}
188
189	boolean expand_document = false;
190	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
191	if (ed_arg != null && ed_arg.equals("1"))
192	{
193	expand_document = true;
194	}
195
196	boolean expand_contents = false;
197	if (expand_document)
198	{ // we always expand the contents with the text
199	expand_contents = true;
200	}
201	else
202	{
203	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
204	if (ec_arg != null && ec_arg.equals("1"))
205	{
206	expand_contents = true;
207	}
208	}
209
210	// do we want text content? Not if no_text=1.
211	// expand_document overrides this. - should it??
212	boolean get_text = true;
213	String nt_arg = (String) params.get(NO_TEXT_ARG);
214
215	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
216	logger.debug("SETTING GET TEXT TO FALSE");
217	get_text = false;
218	} else {
219	logger.debug("GET TEXT REMAINS TRUE");
220	}
221
222	// the_document is where all the doc info - structure and metadata etc
223	// is added into, to be returned in the page
224	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
225	page_response.appendChild(the_document);
226
227	// create a basic doc list containing the current node
228	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
229	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
230	basic_doc_list.appendChild(current_doc);
231	if (document_id != null)
232	{
233	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
234	}
235	else
236	{
237	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
238	// do we need this??
239	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
240	}
241
242	if (document_type == null)
243	{
244	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
245	}
246	if (document_type == null)
247	{
248	logger.debug("##### doctype is null, setting to simple");
249	document_type = GSXML.DOC_TYPE_SIMPLE;
250	}
251
252	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
253
254	// start getting doc structure
255
256	// Create a parameter list to specify the required structure information
257	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
258
259	if (service_params != null)
260	{
261	GSXML.addParametersToList(ds_param_list, service_params);
262	}
263
264	Element ds_param = null;
265	boolean get_structure = false;
266	boolean get_structure_info = false;
267	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
268	{
269	get_structure_info = true;
270
271	if (expand_contents)
272	{
273	ds_param = doc.createElement(GSXML.PARAM_ELEM);
274	ds_param_list.appendChild(ds_param);
275	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
276	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
277	}
278
279	// get the info needed for paged naviagtion
280	ds_param = doc.createElement(GSXML.PARAM_ELEM);
281	ds_param_list.appendChild(ds_param);
282	ds_param.setAttribute(GSXML.NAME_ATT, "info");
283	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
284	ds_param = doc.createElement(GSXML.PARAM_ELEM);
285	ds_param_list.appendChild(ds_param);
286	ds_param.setAttribute(GSXML.NAME_ATT, "info");
287	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
288	ds_param = doc.createElement(GSXML.PARAM_ELEM);
289	ds_param_list.appendChild(ds_param);
290	ds_param.setAttribute(GSXML.NAME_ATT, "info");
291	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
292
293	if (get_siblings)
294	{
295	ds_param = doc.createElement(GSXML.PARAM_ELEM);
296	ds_param_list.appendChild(ds_param);
297	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
298	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
299	}
300
301	}
302	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
303	{
304	get_structure = true;
305	if (expand_contents)
306	{
307	ds_param = doc.createElement(GSXML.PARAM_ELEM);
308	ds_param_list.appendChild(ds_param);
309	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
310	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
311	}
312	else
313	{
314	// get the info needed for table of contents
315	ds_param = doc.createElement(GSXML.PARAM_ELEM);
316	ds_param_list.appendChild(ds_param);
317	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
318	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
319	ds_param = doc.createElement(GSXML.PARAM_ELEM);
320	ds_param_list.appendChild(ds_param);
321	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
322	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
323	if (get_siblings)
324	{
325	ds_param = doc.createElement(GSXML.PARAM_ELEM);
326	ds_param_list.appendChild(ds_param);
327	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
328	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
329	}
330	}
331	}
332	else
333	{
334	// we dont need any structure
335	}
336
337	boolean has_dummy = false;
338	if (get_structure \|\| get_structure_info)
339	{
340
341	// Build a request to obtain the document structure
342	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
343	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
344	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
345	ds_message.appendChild(ds_request);
346	ds_request.appendChild(ds_param_list);
347
348	// add the node list we created earlier
349	ds_request.appendChild(basic_doc_list);
350
351	// Process the document structure retrieve message
352	Element ds_response_message = (Element) this.mr.process(ds_message);
353	if (processErrorElements(ds_response_message, page_response))
354	{
355	return result;
356	}
357
358	// get the info and print out
359	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
360	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
361	path = GSPath.appendLink(path, "nodeStructureInfo");
362	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
363	// get the doc_node bit
364	if (ds_response_struct_info != null)
365	{
366	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
367	}
368	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
369	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
370	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
371	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
372
373	if (ds_response_structure != null)
374	{
375	// add the contents of the structure bit into the_document
376	NodeList structs = ds_response_structure.getChildNodes();
377	for (int i = 0; i < structs.getLength(); i++)
378	{
379	the_document.appendChild(doc.importNode(structs.item(i), true));
380	}
381	}
382	else
383	{
384	// no structure nodes, so put in a dummy doc node
385	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
386	if (document_id != null)
387	{
388	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
389	}
390	else
391	{
392	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
393
394	}
395	the_document.appendChild(doc_node);
396	has_dummy = true;
397	}
398	}
399	else
400	{ // a simple type - we dont have a dummy node for simple
401	// should think about this more
402	// no structure request, so just put in a dummy doc node
403	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
404	if (document_id != null)
405	{
406	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
407	}
408	else
409	{
410	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
411	}
412	the_document.appendChild(doc_node);
413	has_dummy = true;
414	}
415
416	// end getting doc structure
417
418	// start getting doc metadata
419
420	// Build a request to obtain some document metadata
421	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
422	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
423	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
424	dm_message.appendChild(dm_request);
425	// Create a parameter list to specify the required metadata information
426
427	HashSet<String> meta_names = new HashSet<String>();
428	meta_names.add("Title"); // the default
429	if (format_elem != null)
430	{
431	getRequiredMetadataNames(format_elem, meta_names);
432	}
433
434	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
435	if (extraMetaListElem != null)
436	{
437	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
438	for (int i = 0; i < extraMetaList.getLength(); i++)
439	{
440	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
441	}
442	}
443
444	Element dm_param_list = createMetadataParamList(doc,meta_names);
445	if (service_params != null)
446	{
447	GSXML.addParametersToList(dm_param_list, service_params);
448	}
449
450	dm_request.appendChild(dm_param_list);
451
452	// create the doc node list for the metadata request
453	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
454	dm_request.appendChild(dm_doc_list);
455
456	// Add each node from the structure response into the metadata request
457	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
458	for (int i = 0; i < doc_nodes.getLength(); i++)
459	{
460	Element doc_node = (Element) doc_nodes.item(i);
461	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
462
463	// Add the documentNode to the list
464	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
465	if (needSectionContent(params)) {
466	if (doc_node_id.equals(document_id)) {
467	dm_doc_list.appendChild(dm_doc_node);
468	}
469	} else {
470	dm_doc_list.appendChild(dm_doc_node);
471	}
472	//dm_doc_list.appendChild(dm_doc_node);
473	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
474	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
475	if (document_id == null){
476	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
477	}
478
479	}
480	// we also want a metadata request to the top level document to get
481	// assocfilepath - this could be cached too
482	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
483	dm_message.appendChild(doc_meta_request);
484	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
485	if (service_params != null)
486	{
487	GSXML.addParametersToList(doc_meta_param_list, service_params);
488	}
489
490	doc_meta_request.appendChild(doc_meta_param_list);
491	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
492	doc_meta_param_list.appendChild(doc_param);
493	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
494	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
495
496	// create the doc node list for the metadata request
497	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
498	doc_meta_request.appendChild(doc_list);
499
500	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
501	// the node we want is the root document node
502	if (document_id != null)
503	{
504	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
505	}
506	/*else
507	{
508	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
509	// can we assume that href is always a top level doc??
510	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
511	//doc_node.setAttribute("externalURL", has_rl);
512	}*/
513	doc_list.appendChild(doc_node);
514
515	Element dm_response_message = (Element) this.mr.process(dm_message);
516	if (processErrorElements(dm_response_message, page_response))
517	{
518	return result;
519	}
520
521	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
522	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
523
524	// Merge the metadata with the structure information
525	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
526	for (int i = 0; i < doc_nodes.getLength(); i++)
527	{
528	Node dcNode;
529	String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
530	if (node_idd.isEmpty()) {
531	String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
532	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
533	} else {
534	dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
535	}
536	GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
537	}
538	// get the top level doc metadata out
539	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
540	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
541	GSXML.mergeMetadataLists(the_document, top_doc_node);
542
543	// do we want doc text content? If not, we are done.
544	if (!get_text) {
545	// don't get text
546	return result;
547	}
548
549
550	HashSet<String> query_term_variants = null;
551	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
552	boolean do_highlight_query_terms = highlight_query_terms;
553	if (highlight_query_terms) {
554	// lets get the query term equivalents
555	query_term_variants = new HashSet<String>();
556	phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
557	if (!getQueryTermVariants(request, null, /current_node_id,/ query_term_variants, phrase_query_term_variants_hierarchy)) {
558	do_highlight_query_terms = false; // we couldn't get the terms
559	}
560	}
561
562	// lets try marking up the metadata with search terms
563	if (do_highlight_query_terms) {
564	highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);
565	}
566
567	// Build a request to obtain some document content
568	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
569	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
570	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
571	dc_message.appendChild(dc_request);
572
573	// Create a parameter list to specify the request parameters - empty for now
574	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
575	if (service_params != null)
576	{
577	GSXML.addParametersToList(dc_param_list, service_params);
578	}
579
580	dc_request.appendChild(dc_param_list);
581
582	// get the content
583	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
584	if (expand_document)
585	{
586	dc_request.appendChild(dm_doc_list);
587	}
588	else
589	{
590	dc_request.appendChild(basic_doc_list);
591	}
592	Element dc_response_message = (Element) this.mr.process(dc_message);
593
594	if (processErrorElements(dc_response_message, page_response))
595	{
596	return result;
597
598	}
599	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
600
601	if (expand_document)
602	{
603	// Merge the content with the structure information
604	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
605	for (int i = 0; i < doc_nodes.getLength(); i++)
606	{
607	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
608	//Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
609	Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
610	Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
611	if (content != null)
612	{
613	if (do_highlight_query_terms)
614	{
615	content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy);
616	}
617
618	doc_nodes.item(i).appendChild(doc.importNode(content, true));
619	}
620	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
621	}
622	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
623	Element dummy_node = (Element) doc_nodes.item(0);
624	the_document.removeChild(dummy_node);
625	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
626	NodeList dummy_children = dummy_node.getChildNodes();
627	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
628	{
629	// special case as we don't want more than one metadata list
630	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
631	{
632	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
633	}
634	else
635	{
636	the_document.appendChild(dummy_children.item(i));
637	}
638	}
639	}
640	}
641	else
642	{
643	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
644	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
645	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
646	//Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
647
648	if (dc_response_doc_content == null)
649	{
650	// no content to add
651	if (dc_response_doc.getAttribute("external").equals("true"))
652	{
653
654	//if (dc_response_doc_external != null)
655	//{
656	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
657
658	the_document.setAttribute("selectedNode", href_id);
659	the_document.setAttribute("external", href_id);
660	}
661	return result;
662	}
663	if (do_highlight_query_terms)
664	{
665	dc_response_doc.removeChild(dc_response_doc_content);
666
667	dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy);
668	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
669	}
670
671	if (provide_annotations)
672	{
673	String service_selected = (String) params.get(ENRICH_DOC_ARG);
674	if (service_selected != null && service_selected.equals("1"))
675	{
676	// now we can modifiy the response doc if needed
677	String enrich_service = (String) params.get(GSParams.SERVICE);
678	// send a message to the service
679	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
680	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
681	enrich_message.appendChild(enrich_request);
682	// check for parameters
683	HashMap e_service_params = (HashMap) params.get("s1");
684	if (e_service_params != null)
685	{
686	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
687	GSXML.addParametersToList(enrich_pl, e_service_params);
688	enrich_request.appendChild(enrich_pl);
689	}
690	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
691	enrich_request.appendChild(e_doc_list);
692	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
693
694	Node enrich_response = this.mr.process(enrich_message);
695
696	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
697	path = GSPath.createPath(links);
698	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
699
700	}
701	} // if provide_annotations
702
703	// use the returned id rather than the sent one cos there may have
704	// been modifiers such as .pr that are removed.
705	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
706	the_document.setAttribute("selectedNode", modified_doc_id);
707	if (has_dummy)
708	{
709	// change the id if necessary and add the content
710	Element dummy_node = (Element) doc_nodes.item(0);
711
712	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
713	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
714	// hack for simple type
715	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
716	{
717	// we dont want the internal docNode, just want the content and metadata in the document
718	// rethink this!!
719	the_document.removeChild(dummy_node);
720
721	NodeList dummy_children = dummy_node.getChildNodes();
722	//for (int i=0; i<dummy_children.getLength(); i++) {
723	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
724	{
725	// special case as we don't want more than one metadata list
726	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
727	{
728	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
729	}
730	else
731	{
732	the_document.appendChild(dummy_children.item(i));
733	}
734	}
735	}
736
737	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
738	}
739	else
740	{
741	// Merge the document content with the metadata and structure information
742	for (int i = 0; i < doc_nodes.getLength(); i++)
743	{
744	Node dn = doc_nodes.item(i);
745	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
746	if (dn_id.equals(modified_doc_id))
747	{
748	dn.appendChild(doc.importNode(dc_response_doc_content, true));
749	break;
750	}
751	}
752	}
753	}
754	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
755	return result;
756	}
757
758	protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
759	// call get archive doc
760	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
761	String to = "DocXMLGetSection";
762	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
763	dx_message.appendChild(dx_request);
764	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
765	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
766	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
767	dx_request.appendChild(dx_section);
768
769	Element dx_response_message = (Element) this.mr.process(dx_message);
770	if (processErrorElements(dx_response_message, page_response))
771	{
772	return result;
773	}
774
775	// get the section out
776	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
777	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
778	if (section == null) {
779	logger.error("no archive doc returned for "+document_id);
780	return result;
781	}
782	// convert the archive format into the internal format that the page response requires
783
784	// work out doctype
785	// NOTE: this will be coming from collection database in index
786	// the archive file doesn't store this. So we have to assume
787	// that the doc type will not be changing with any
788	// modifications happening to archives.
789
790	// if doc type is null, then we need to work it out.
791	// create a basic doc list containing the current node
792
793	if (document_type == null) {
794	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
795	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
796	basic_doc_list.appendChild(current_doc);
797	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
798	basic_doc_list.appendChild(current_doc);
799	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
800	}
801
802	if (document_type == null) {
803	logger.debug("@@@ doctype is null, setting to simple");
804	document_type = GSXML.DOC_TYPE_SIMPLE;
805	}
806
807	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
808	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
809	page_response.appendChild(doc_elem);
810
811	Element transformed_section = transformArchiveToDocument(section);
812	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
813	// simple doc, only returning a single document node, which is the top level section.
814	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
815	GSXML.mergeElements(doc_elem, transformed_section);
816	return result;
817	}
818
819	// multi sectioned document.
820	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
821	// In docEdit mode, we obtain the text from archives, from doc.xml
822	// Now the transformation has replaced <Section> with <documentNode>
823	// Need to add nodeID, nodeType and docType attributes to each docNode
824	// as doc.xml doesn't store that.
825	insertDocNodeAttributes(transformed_section, document_type, null);
826	doc_elem.appendChild(doc.importNode(transformed_section, true));
827	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
828
829	return result;
830	}
831
832
833	private boolean needSectionContent(HashMap<String, Serializable> params) {
834	String document_id = (String) params.get(GSParams.DOCUMENT);
835	String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
836	String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
837	if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
838	return true;
839	}
840
841	return false;
842	}
843	/**
844	* this method gets the collection description, the format info, the list of
845	* enrich services, etc - stuff that is needed for the page, but is the same
846	* whatever the query is - should be cached
847	*/
848	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
849	{
850	Document doc = page_response.getOwnerDocument();
851
852	// create a message to process - contains requests for the collection
853	// description, the format element, the enrich services on offer
854	// these could all be cached
855	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
856	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
857	// the format request - ignore for now, where does this request go to??
858	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
859	info_message.appendChild(format_request);
860
861	// the enrich_services request - only do this if provide_annotations is true
862
863	if (provide_annotations)
864	{
865	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
866	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
867	info_message.appendChild(enrich_services_request);
868	}
869
870	Element info_response = (Element) this.mr.process(info_message);
871
872	// the collection is the first response
873	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
874	Element format_resp = (Element) responses.item(0);
875
876	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
877	if (format_elem != null)
878	{
879	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
880	if (global_format_elem != null)
881	{
882	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
883	}
884
885	// set the format type
886	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
887	page_response.appendChild(doc.importNode(format_elem, true));
888	}
889
890	if (provide_annotations)
891	{
892	Element services_resp = (Element) responses.item(1);
893
894	// a new message for the mr
895	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
896	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
897	boolean service_found = false;
898	for (int j = 0; j < e_services.getLength(); j++)
899	{
900	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
901	{
902	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
903	enrich_message.appendChild(s);
904	service_found = true;
905	}
906	}
907	if (service_found)
908	{
909	Element enrich_response = (Element) this.mr.process(enrich_message);
910
911	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
912	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
913	for (int i = 0; i < e_responses.getLength(); i++)
914	{
915	Element e_resp = (Element) e_responses.item(i);
916	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
917	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
918	service_list.appendChild(e_service);
919	}
920	page_response.appendChild(service_list);
921	}
922	} // if provide_annotations
923	return true;
924
925	}
926
927	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
928	{
929	Document doc = basic_doc_list.getOwnerDocument();
930
931	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
932	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
933	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
934	ds_message.appendChild(ds_request);
935
936	// Create a parameter list to specify the required structure information
937	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
938	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
939	ds_param_list.appendChild(ds_param);
940	ds_param.setAttribute(GSXML.NAME_ATT, "info");
941	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
942
943	ds_request.appendChild(ds_param_list);
944
945	// add the node list we created earlier
946	ds_request.appendChild(basic_doc_list);
947
948	// Process the document structure retrieve message
949	Element ds_response_message = (Element) this.mr.process(ds_message);
950	if (processErrorElements(ds_response_message, page_response))
951	{
952	return null;
953	}
954
955	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
956	String path = GSPath.createPath(links);
957	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
958	if (info_elem == null) {
959	return null;
960	}
961	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
962	if (doctype_elem != null)
963	{
964	String doc_type = doctype_elem.getAttribute("value");
965	return doc_type;
966	}
967	return null;
968	}
969
970	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
971	// The docType remains constant as in parameter document_type
972	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
973	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
974	// The nodeType is root if rootNode, internal if there are children and leaf if no children
975	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
976
977	boolean isRoot = false;
978	if(id == null) { // rootNode, get the root nodeID to work with recursively
979	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
980	isRoot = true;
981	} else { // for all but the root node, need to still set the nodeID
982	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
983	}
984
985	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
986
987	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
988	if(docNodes.getLength() > 0) {
989	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
990	for(int i = 0; i < docNodes.getLength(); i++) {
991	Element childDocNode = (Element)docNodes.item(i);
992
993	// work out the child docNode's nodeID based on current id
994	String nodeID = id + "." + (i+1);
995	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
996	}
997	} else {
998	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
999	}
1000
1001	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1002	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1003
1004	}
1005
1006	/** run the XSLT transform which converts from doc.xml format to our internal document format */
1007	protected Element transformArchiveToDocument(Element section) {
1008
1009	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1010	if (stylesheet_filename == null) {
1011	logger.error("Couldn't find stylesheet archive2document.xsl");
1012	return section;
1013	}
1014
1015	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1016	if (stylesheet_doc == null) {
1017	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1018	return section;
1019	}
1020
1021	Document section_doc = XMLConverter.newDOM();
1022	section_doc.appendChild(section_doc.importNode(section, true));
1023	Node result = this.transformer.transform(stylesheet_doc, section_doc);
1024	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1025
1026	Element new_element;
1027	if (result.getNodeType() == Node.DOCUMENT_NODE) {
1028	new_element = ((Document) result).getDocumentElement();
1029	} else {
1030	new_element = (Element) result;
1031	}
1032
1033
1034	return new_element;
1035
1036	}
1037
1038	/**
1039	* this involves a bit of a hack to get the equivalent query terms - has to
1040	* requery the query service - uses the last selected service name. (if it
1041	* ends in query). should this action do the query or should it send a
1042	* message to the query action? but that will involve lots of extra stuff.
1043	*/
1044	protected boolean getQueryTermVariants(Element request, String current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1045	{
1046	Document doc = request.getOwnerDocument();
1047
1048	// do the query again to get term info
1049	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1050	//logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list));
1051	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1052
1053	HashMap previous_params = (HashMap) params.get("p");
1054	if (previous_params == null)
1055	{
1056	//logger.error("no p parms");
1057	return false;
1058	}
1059	String service_name = (String) previous_params.get(GSParams.SERVICE);
1060	if (service_name == null \|\| !service_name.endsWith("Query"))
1061	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1062	logger.debug("invalid service, not doing highlighting");
1063	return false;
1064	}
1065
1066	String collection = (String) params.get(GSParams.COLLECTION);
1067	UserContext userContext = new UserContext(request);
1068	String to = GSPath.appendLink(collection, service_name);
1069
1070	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1071	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1072	mr_query_message.appendChild(mr_query_request);
1073
1074	// paramList
1075	HashMap service_params = (HashMap) params.get("s1");
1076
1077	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1078	GSXML.addParametersToList(query_param_list, service_params);
1079	// is this only used for solr??? - do we still want it for solr??
1080	// if (current_node_id != null) {
1081	// GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1082	// } else {
1083	// GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1084	// }
1085	mr_query_request.appendChild(query_param_list);
1086	// do the query
1087
1088	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1089
1090	// find the term lists
1091	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1092	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1093	if (query_term_list_element == null)
1094	{
1095	// no term info
1096	logger.error("No query term information. xx\n");
1097	return false;
1098	}
1099	// logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element));
1100	//String content = GSXML.getNodeText(dc_response_doc_content);
1101
1102	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1103	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1104
1105	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1106	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1107	{
1108	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1109	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1110	{
1111	for (int i = 0; i < terms_nodelist.getLength(); i++)
1112	{
1113	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1114	String termValueU = null;
1115	String termValueL = null;
1116
1117	if (termValue.length() > 1)
1118	{
1119	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1120	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1121	}
1122	else
1123	{
1124	termValueU = termValue.substring(0, 1).toUpperCase();
1125	termValueL = termValue.substring(0, 1).toLowerCase();
1126	}
1127	query_term_variants.add(termValueU);
1128	query_term_variants.add(termValueL);
1129	}
1130	}
1131	}
1132	else
1133	{
1134	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1135	{
1136	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1137	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1138	for (int j = 0; j < equivalent_terms.length; j++)
1139	{
1140	query_term_variants.add(equivalent_terms[j]);
1141	}
1142	}
1143	}
1144
1145
1146	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1147	String performed_query = GSXML.getNodeText(query_element) + " ";
1148
1149	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1150	int term_start = 0;
1151	boolean in_term = false;
1152	boolean in_phrase = false;
1153	for (int i = 0; i < performed_query.length(); i++)
1154	{
1155	char character = performed_query.charAt(i);
1156	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1157
1158	// Has a query term just started?
1159	if (in_term == false && is_character_letter_or_digit == true)
1160	{
1161	in_term = true;
1162	term_start = i;
1163	}
1164
1165	// Or has a term just finished?
1166	else if (in_term == true && is_character_letter_or_digit == false)
1167	{
1168	in_term = false;
1169	String term = performed_query.substring(term_start, i);
1170
1171	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1172	if (term_element != null)
1173	{
1174
1175	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1176
1177	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1178	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1179	{
1180	String termValueU = null;
1181	String termValueL = null;
1182
1183	if (term.length() > 1)
1184	{
1185	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1186	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1187	}
1188	else
1189	{
1190	termValueU = term.substring(0, 1).toUpperCase();
1191	termValueL = term.substring(0, 1).toLowerCase();
1192	}
1193
1194	phrase_query_p_term_x_variants.add(termValueU);
1195	phrase_query_p_term_x_variants.add(termValueL);
1196	}
1197	else
1198	{
1199	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1200	{
1201	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1202	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1203	for (int k = 0; k < term_equivalent_terms.length; k++)
1204	{
1205	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1206	}
1207	}
1208	}
1209	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1210
1211	if (in_phrase == false)
1212	{
1213	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1214	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1215	}
1216	}
1217	}
1218	// Watch for phrases (surrounded by quotes)
1219	if (character == '\"')
1220	{
1221	// Has a phrase just started?
1222	if (in_phrase == false)
1223	{
1224	in_phrase = true;
1225	}
1226	// Or has a phrase just finished?
1227	else if (in_phrase == true)
1228	{
1229	in_phrase = false;
1230	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1231	}
1232
1233	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1234	}
1235	}
1236
1237	return true;
1238	}
1239
1240	/** redo the request to get the query terms then highlight them in the text
1241	*
1242	*/
1243	protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content)
1244	{
1245	Document doc = request.getOwnerDocument();
1246
1247	// do the query again to get term info
1248	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1249	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1250
1251	HashMap previous_params = (HashMap) params.get("p");
1252	if (previous_params == null)
1253	{
1254	return dc_response_doc_content;
1255	}
1256	String service_name = (String) previous_params.get(GSParams.SERVICE);
1257	if (service_name == null \|\| !service_name.endsWith("Query"))
1258	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1259	logger.debug("invalid service, not doing highlighting");
1260	return dc_response_doc_content;
1261	}
1262	String collection = (String) params.get(GSParams.COLLECTION);
1263	UserContext userContext = new UserContext(request);
1264	String to = GSPath.appendLink(collection, service_name);
1265
1266	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1267	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1268	mr_query_message.appendChild(mr_query_request);
1269
1270	// paramList
1271	HashMap service_params = (HashMap) params.get("s1");
1272
1273	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1274	GSXML.addParametersToList(query_param_list, service_params);
1275	if (current_node_id != null) {
1276	GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1277	} else {
1278	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1279	}
1280	mr_query_request.appendChild(query_param_list);
1281	// do the query
1282	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1283	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1284	Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1285	// For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1286	if (highlighted_Node != null)
1287	{
1288	// Build a request to process highlighted text
1289	logger.error("highlighted node is not null!!!!");
1290	logger.error(XMLConverter.getPrettyString(highlighted_Node));
1291	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1292	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1293	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1294	hl_message.appendChild(dc_request);
1295
1296	// Create a parameter list to specify the request parameters - empty for now
1297	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1298	dc_request.appendChild(dc_param_list);
1299
1300	// get the content
1301	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1302	dc_request.appendChild(doc_list);
1303	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1304	doc_list.appendChild(current_doc);
1305	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1306	//Append highlighted content to request for processing
1307	dc_request.appendChild(doc.importNode(highlighted_Node, true));
1308	Element hl_response_message = (Element) this.mr.process(hl_message);
1309
1310	//Get results
1311	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1312	Element content = (Element) contentList.item(0);
1313	return content;
1314	}
1315	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1316	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1317	if (query_term_list_element == null)
1318	{
1319	// no term info
1320	logger.error("No query term information. yy\n");
1321	return dc_response_doc_content;
1322	}
1323
1324	String content = GSXML.getNodeText(dc_response_doc_content);
1325
1326	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1327	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1328
1329	HashSet<String> query_term_variants = new HashSet<String>();
1330	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1331	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1332	{
1333	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1334	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1335	{
1336	for (int i = 0; i < terms_nodelist.getLength(); i++)
1337	{
1338	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1339	String termValueU = null;
1340	String termValueL = null;
1341
1342	if (termValue.length() > 1)
1343	{
1344	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1345	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1346	}
1347	else
1348	{
1349	termValueU = termValue.substring(0, 1).toUpperCase();
1350	termValueL = termValue.substring(0, 1).toLowerCase();
1351	}
1352
1353	query_term_variants.add(termValueU);
1354	query_term_variants.add(termValueL);
1355	}
1356	}
1357	}
1358	else
1359	{
1360	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1361	{
1362	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1363	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1364	for (int j = 0; j < equivalent_terms.length; j++)
1365	{
1366	query_term_variants.add(equivalent_terms[j]);
1367	}
1368	}
1369	}
1370
1371	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1372
1373	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1374	String performed_query = GSXML.getNodeText(query_element) + " ";
1375
1376	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1377	int term_start = 0;
1378	boolean in_term = false;
1379	boolean in_phrase = false;
1380	for (int i = 0; i < performed_query.length(); i++)
1381	{
1382	char character = performed_query.charAt(i);
1383	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1384
1385	// Has a query term just started?
1386	if (in_term == false && is_character_letter_or_digit == true)
1387	{
1388	in_term = true;
1389	term_start = i;
1390	}
1391
1392	// Or has a term just finished?
1393	else if (in_term == true && is_character_letter_or_digit == false)
1394	{
1395	in_term = false;
1396	String term = performed_query.substring(term_start, i);
1397
1398	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1399	if (term_element != null)
1400	{
1401
1402	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1403
1404	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1405	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1406	{
1407	String termValueU = null;
1408	String termValueL = null;
1409
1410	if (term.length() > 1)
1411	{
1412	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1413	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1414	}
1415	else
1416	{
1417	termValueU = term.substring(0, 1).toUpperCase();
1418	termValueL = term.substring(0, 1).toLowerCase();
1419	}
1420
1421	phrase_query_p_term_x_variants.add(termValueU);
1422	phrase_query_p_term_x_variants.add(termValueL);
1423	}
1424	else
1425	{
1426	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1427	{
1428	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1429	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1430	for (int k = 0; k < term_equivalent_terms.length; k++)
1431	{
1432	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1433	}
1434	}
1435	}
1436	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1437
1438	if (in_phrase == false)
1439	{
1440	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1441	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1442	}
1443	}
1444	}
1445	// Watch for phrases (surrounded by quotes)
1446	if (character == '\"')
1447	{
1448	// Has a phrase just started?
1449	if (in_phrase == false)
1450	{
1451	in_phrase = true;
1452	}
1453	// Or has a phrase just finished?
1454	else if (in_phrase == true)
1455	{
1456	in_phrase = false;
1457	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1458	}
1459
1460	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1461	}
1462	}
1463
1464	return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1465	}
1466
1467	/**
1468	* Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1469	*/
1470	protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
1471
1472	//logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
1473	NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1474	for (int j=named_elems.getLength()-1; j>=0; j--) {
1475	Element this_elem = (Element)named_elems.item(j);
1476	Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy);
1477	this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1478	}
1479
1480
1481	//logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
1482	return true;
1483	}
1484	/**
1485	* Highlights query terms in the text content of an element.
1486	*/
1487	private Element highlightQueryTermsElementText(Document doc, Element original_element, /String content,/ HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1488	{
1489	//logger.error("in hl internal, query terms are "+query_term_variants.toString());
1490	String content = GSXML.getNodeText(original_element);
1491	//logger.error("original elem = "+XMLConverter.getPrettyString(original_element));
1492	logger.error("highlighting content: "+content);
1493	// Convert the content string to an array of characters for speed
1494	char[] content_characters = new char[content.length()];
1495	content.getChars(0, content.length(), content_characters, 0);
1496
1497	// Now skim through the content, identifying word matches
1498	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1499	int word_start = 0;
1500	boolean in_word = false;
1501	boolean preceding_word_matched = false;
1502	boolean inTag = false;
1503	for (int i = 0; i < content_characters.length; i++)
1504	{
1505	//We don't want to find words inside HTML tags
1506	if (content_characters[i] == '<')
1507	{
1508	inTag = true;
1509	continue;
1510	}
1511	else if (inTag && content_characters[i] == '>')
1512	{
1513	inTag = false;
1514	}
1515	else if (inTag)
1516	{
1517	continue;
1518	}
1519
1520	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1521
1522	// Has a word just started?
1523	if (in_word == false && is_character_letter_or_digit == true)
1524	{
1525	in_word = true;
1526	word_start = i;
1527	}
1528
1529	// Or has a word just finished?
1530	else if (in_word == true && is_character_letter_or_digit == false)
1531	{
1532	in_word = false;
1533
1534	// Check if the word matches any of the query term equivalents
1535	String word = new String(content_characters, word_start, (i - word_start));
1536	//logger.error("word: "+word);
1537	if (query_term_variants.contains(word))
1538	{
1539	//logger.error("matched");
1540	// We have found a matching word, so remember its location
1541	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1542	preceding_word_matched = true;
1543	}
1544	else
1545	{
1546	preceding_word_matched = false;
1547	}
1548	}
1549	}
1550
1551	// Don't forget the last word...
1552	if (in_word == true)
1553	{
1554	// Check if the word matches any of the query term equivalents
1555	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1556	if (query_term_variants.contains(word))
1557	{
1558	// We have found a matching word, so remember its location
1559	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1560	}
1561	}
1562
1563	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1564	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1565
1566	// Deal with phrases now
1567	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1568	for (int i = 0; i < word_matches.size(); i++)
1569	{
1570	WordMatch word_match = word_matches.get(i);
1571
1572	// See if any partial phrase matches are extended by this word
1573	if (word_match.preceding_word_matched)
1574	{
1575	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1576	{
1577	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1578	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1579	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1580	if (phrase_query_p_term_x_variants.contains(word_match.word))
1581	{
1582	partial_phrase_match.num_words_matched++;
1583
1584	// Has a complete phrase match occurred?
1585	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1586	{
1587	// Check for overlaps by looking at the previous highlight range
1588	if (!highlight_end_positions.isEmpty())
1589	{
1590	int last_highlight_index = highlight_end_positions.size() - 1;
1591	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1592	if (last_highlight_end > partial_phrase_match.start_position)
1593	{
1594	// There is an overlap, so remove the previous phrase match
1595	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1596	highlight_end_positions.remove(last_highlight_index);
1597	partial_phrase_match.start_position = last_highlight_start;
1598	}
1599	}
1600
1601	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1602	highlight_end_positions.add(new Integer(word_match.end_position));
1603	}
1604	// No, but add the partial match back into the list for next time
1605	else
1606	{
1607	partial_phrase_matches.add(partial_phrase_match);
1608	}
1609	}
1610	}
1611	}
1612	else
1613	{
1614	partial_phrase_matches.clear();
1615	}
1616
1617	// See if this word is at the start of any of the phrases
1618	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1619	{
1620	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1621	if (phrase_query_p_term_variants_list.size()>0) {
1622	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1623	if (phrase_query_p_term_1_variants.contains(word_match.word))
1624	{
1625	// If this phrase is just one word long, we have a complete match
1626	if (phrase_query_p_term_variants_list.size() == 1)
1627	{
1628	highlight_start_positions.add(new Integer(word_match.start_position));
1629	highlight_end_positions.add(new Integer(word_match.end_position));
1630	}
1631	// Otherwise we have the start of a potential phrase match
1632	else
1633	{
1634	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1635	}
1636	}
1637	}
1638	}
1639	}
1640
1641	// Now add the annotation tags into the document at the correct points
1642	//Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1643	Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1644	int last_wrote = 0;
1645	for (int i = 0; i < highlight_start_positions.size(); i++)
1646	{
1647	int highlight_start = highlight_start_positions.get(i).intValue();
1648	int highlight_end = highlight_end_positions.get(i).intValue();
1649
1650	// Print anything before the highlight range
1651	if (last_wrote < highlight_start)
1652	{
1653	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1654	content_element.appendChild(doc.createTextNode(preceding_text));
1655	}
1656
1657	// Print the highlight text, annotated
1658	if (highlight_end > last_wrote)
1659	{
1660	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1661	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1662	annotation_element.setAttribute("type", "query_term");
1663	content_element.appendChild(annotation_element);
1664	last_wrote = highlight_end;
1665	}
1666	}
1667
1668	// Finish off any unwritten text
1669	if (last_wrote < content_characters.length)
1670	{
1671	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1672	content_element.appendChild(doc.createTextNode(remaining_text));
1673	}
1674	return content_element;
1675	}
1676
1677	private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1678	{
1679	// Convert the content string to an array of characters for speed
1680	char[] content_characters = new char[content.length()];
1681	content.getChars(0, content.length(), content_characters, 0);
1682
1683	// Now skim through the content, identifying word matches
1684	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1685	int word_start = 0;
1686	boolean in_word = false;
1687	boolean preceding_word_matched = false;
1688	boolean inTag = false;
1689	for (int i = 0; i < content_characters.length; i++)
1690	{
1691	//We don't want to find words inside HTML tags
1692	if (content_characters[i] == '<')
1693	{
1694	inTag = true;
1695	continue;
1696	}
1697	else if (inTag && content_characters[i] == '>')
1698	{
1699	inTag = false;
1700	}
1701	else if (inTag)
1702	{
1703	continue;
1704	}
1705
1706	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1707
1708	// Has a word just started?
1709	if (in_word == false && is_character_letter_or_digit == true)
1710	{
1711	in_word = true;
1712	word_start = i;
1713	}
1714
1715	// Or has a word just finished?
1716	else if (in_word == true && is_character_letter_or_digit == false)
1717	{
1718	in_word = false;
1719
1720	// Check if the word matches any of the query term equivalents
1721	String word = new String(content_characters, word_start, (i - word_start));
1722	if (query_term_variants.contains(word))
1723	{
1724	// We have found a matching word, so remember its location
1725	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1726	preceding_word_matched = true;
1727	}
1728	else
1729	{
1730	preceding_word_matched = false;
1731	}
1732	}
1733	}
1734
1735	// Don't forget the last word...
1736	if (in_word == true)
1737	{
1738	// Check if the word matches any of the query term equivalents
1739	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1740	if (query_term_variants.contains(word))
1741	{
1742	// We have found a matching word, so remember its location
1743	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1744	}
1745	}
1746
1747	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1748	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1749
1750	// Deal with phrases now
1751	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1752	for (int i = 0; i < word_matches.size(); i++)
1753	{
1754	WordMatch word_match = word_matches.get(i);
1755
1756	// See if any partial phrase matches are extended by this word
1757	if (word_match.preceding_word_matched)
1758	{
1759	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1760	{
1761	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1762	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1763	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1764	if (phrase_query_p_term_x_variants.contains(word_match.word))
1765	{
1766	partial_phrase_match.num_words_matched++;
1767
1768	// Has a complete phrase match occurred?
1769	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1770	{
1771	// Check for overlaps by looking at the previous highlight range
1772	if (!highlight_end_positions.isEmpty())
1773	{
1774	int last_highlight_index = highlight_end_positions.size() - 1;
1775	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1776	if (last_highlight_end > partial_phrase_match.start_position)
1777	{
1778	// There is an overlap, so remove the previous phrase match
1779	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1780	highlight_end_positions.remove(last_highlight_index);
1781	partial_phrase_match.start_position = last_highlight_start;
1782	}
1783	}
1784
1785	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1786	highlight_end_positions.add(new Integer(word_match.end_position));
1787	}
1788	// No, but add the partial match back into the list for next time
1789	else
1790	{
1791	partial_phrase_matches.add(partial_phrase_match);
1792	}
1793	}
1794	}
1795	}
1796	else
1797	{
1798	partial_phrase_matches.clear();
1799	}
1800
1801	// See if this word is at the start of any of the phrases
1802	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1803	{
1804	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1805	if (phrase_query_p_term_variants_list.size()>0) {
1806	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1807	if (phrase_query_p_term_1_variants.contains(word_match.word))
1808	{
1809	// If this phrase is just one word long, we have a complete match
1810	if (phrase_query_p_term_variants_list.size() == 1)
1811	{
1812	highlight_start_positions.add(new Integer(word_match.start_position));
1813	highlight_end_positions.add(new Integer(word_match.end_position));
1814	}
1815	// Otherwise we have the start of a potential phrase match
1816	else
1817	{
1818	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1819	}
1820	}
1821	}
1822	}
1823	}
1824
1825	// Now add the annotation tags into the document at the correct points
1826	Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1827
1828	int last_wrote = 0;
1829	for (int i = 0; i < highlight_start_positions.size(); i++)
1830	{
1831	int highlight_start = highlight_start_positions.get(i).intValue();
1832	int highlight_end = highlight_end_positions.get(i).intValue();
1833
1834	// Print anything before the highlight range
1835	if (last_wrote < highlight_start)
1836	{
1837	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1838	content_element.appendChild(doc.createTextNode(preceding_text));
1839	}
1840
1841	// Print the highlight text, annotated
1842	if (highlight_end > last_wrote)
1843	{
1844	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1845	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1846	annotation_element.setAttribute("type", "query_term");
1847	content_element.appendChild(annotation_element);
1848	last_wrote = highlight_end;
1849	}
1850	}
1851
1852	// Finish off any unwritten text
1853	if (last_wrote < content_characters.length)
1854	{
1855	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1856	content_element.appendChild(doc.createTextNode(remaining_text));
1857	}
1858	return content_element;
1859	}
1860
1861	static private class WordMatch
1862	{
1863	public String word;
1864	public int start_position;
1865	public int end_position;
1866	public boolean preceding_word_matched;
1867
1868	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1869	{
1870	this.word = word;
1871	this.start_position = start_position;
1872	this.end_position = end_position;
1873	this.preceding_word_matched = preceding_word_matched;
1874	}
1875	}
1876
1877	static private class PartialPhraseMatch
1878	{
1879	public int start_position;
1880	public int query_phrase_number;
1881	public int num_words_matched;
1882
1883	public PartialPhraseMatch(int start_position, int query_phrase_number)
1884	{
1885	this.start_position = start_position;
1886	this.query_phrase_number = query_phrase_number;
1887	this.num_words_matched = 1;
1888	}
1889	}
1890	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: