Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32068

Last change on this file since 32068 was 32068, checked in by kjdon, 6 years ago
If docEdit=1, then we just retrieve the entire document from archives, then convert it to the internal format using XSLT. Also, don't get the document text if noText=1.
Property svn:keywords set to `Author Date Id Revision`
File size: 49.7 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24
25	// XML classes
26	import org.w3c.dom.Document;
27	import org.w3c.dom.Element;
28	import org.w3c.dom.Node;
29	import org.w3c.dom.Text;
30	import org.w3c.dom.NodeList;
31
32	// General Java classes
33	import java.util.ArrayList;
34	import java.util.HashMap;
35	import java.util.HashSet;
36	import java.io.File;
37	import java.io.Serializable;
38
39	import org.apache.log4j.*;
40
41	/** Action class for retrieving Documents via the message router */
42	public class DocumentAction extends Action
43	{
44
45	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
46
47	// this is used to specify that the sibling nodes of a selected one should be obtained
48	public static final String SIBLING_ARG = "sib";
49	public static final String GOTO_PAGE_ARG = "gp";
50	public static final String ENRICH_DOC_ARG = "end";
51	public static final String EXPAND_DOCUMENT_ARG = "ed";
52	public static final String EXPAND_CONTENTS_ARG = "ec";
53	public static final String REALISTIC_BOOK_ARG = "book";
54	public static final String NO_TEXT_ARG = "noText";
55	public static final String DOC_EDIT_ARG = "docEdit";
56
57	/**
58	* if this is set to true, when a document is displayed, any annotation type
59	* services (enrich) will be offered to the user as well
60	*/
61	protected boolean provide_annotations = false;
62
63	protected boolean highlight_query_terms = false;
64
65	public boolean configure()
66	{
67	super.configure();
68	String highlight = (String) config_params.get("highlightQueryTerms");
69	if (highlight != null && highlight.equals("true"))
70	{
71	highlight_query_terms = true;
72	}
73	String annotate = (String) config_params.get("displayAnnotationService");
74	if (annotate != null && annotate.equals("true"))
75	{
76	provide_annotations = true;
77	}
78	return true;
79	}
80
81	public Node process(Node message_node)
82	{
83	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
84
85	Element message = GSXML.nodeToElement(message_node);
86	Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
87
88	// the response
89	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
90	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
91	result.appendChild(page_response);
92
93	// get the request - assume only one
94	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
95	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
96	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
97
98	// just in case there are some that need to get passed to the services
99	HashMap service_params = (HashMap) params.get("s0");
100
101	String collection = (String) params.get(GSParams.COLLECTION);
102	String document_id = (String) params.get(GSParams.DOCUMENT);
103	if (document_id != null && document_id.equals(""))
104	{
105	document_id = null;
106	}
107	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
108	if (href != null && href.equals(""))
109	{
110	href = null;
111	}
112	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
113	if (document_id == null && href == null)
114	{
115	logger.error("no document specified!");
116	return result;
117	}
118	if (rl != null && rl.equals("0"))
119	{
120	// this is a true external link, we should have been directed to a different page or action
121	logger.error("rl value was 0, shouldn't get here");
122	return result;
123	}
124
125	UserContext userContext = new UserContext(request);
126
127	//append site metadata
128	addSiteMetadata(page_response, userContext);
129	addInterfaceOptions(page_response);
130
131	// get the additional data needed for the page
132	getBackgroundData(page_response, collection, userContext);
133	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
134
135	if (format_elem != null) {
136	// lets look for param defaults set in config file
137	NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
138	for (int i=0; i<param_defaults.getLength(); i++) {
139	Element p = (Element)param_defaults.item(i);
140	String name = p.getAttribute(GSXML.NAME_ATT);
141	if (params.get(name) ==null) {
142	// wasn't set from interface
143	String value = p.getAttribute(GSXML.VALUE_ATT);
144	params.put(name, value );
145	// also add into request param xml so that xslt knows it too
146	GSXML.addParameterToList(cgi_paramList, name, value);
147	}
148	}
149	}
150
151
152	boolean editing_document = false;
153	String doc_edit = (String) params.get(DOC_EDIT_ARG);
154	if (doc_edit != null && doc_edit.equals("1")) {
155	editing_document = true;
156	}
157
158	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
159	if (editing_document) {
160
161	// call get archive doc
162	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
163	String to = "DocXMLGetSection";
164	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
165	dx_message.appendChild(dx_request);
166	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
167	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
168	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
169	dx_request.appendChild(dx_section);
170
171	Element dx_response_message = (Element) this.mr.process(dx_message);
172	if (processErrorElements(dx_response_message, page_response))
173	{
174	return result;
175	}
176
177	// get the section out
178	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
179	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
180	if (section == null) {
181	logger.error("no archive doc returned for "+document_id);
182	return result;
183	}
184	// convert the archive format into the internal format that the page response requires
185
186	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
187	page_response.appendChild(doc_elem);
188	section.setAttribute(GSXML.NODE_ID_ATT, document_id);
189
190	Element transformed_section = transformArchiveToDocument(section);
191	doc_elem.appendChild(doc.importNode(transformed_section, true));
192	logger.error("dx result = "+XMLConverter.getPrettyString(result));
193	return result;
194	}
195
196	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
197	if (document_type != null && document_type.equals(""))
198	{
199	//document_type = "hierarchy";
200	document_type = null; // we'll get it later if not already specified
201	}
202	//whether to retrieve siblings or not
203	boolean get_siblings = false;
204	String sibs = (String) params.get(SIBLING_ARG);
205	if (sibs != null && sibs.equals("1"))
206	{
207	get_siblings = true;
208	}
209
210	String doc_id_modifier = "";
211	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
212	if (sibling_num != null && !sibling_num.equals(""))
213	{
214	// we have to modify the doc name
215	doc_id_modifier = "." + sibling_num + ".ss";
216	}
217
218	boolean expand_document = false;
219	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
220	if (ed_arg != null && ed_arg.equals("1"))
221	{
222	expand_document = true;
223	}
224
225	boolean expand_contents = false;
226	if (expand_document)
227	{ // we always expand the contents with the text
228	expand_contents = true;
229	}
230	else
231	{
232	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
233	if (ec_arg != null && ec_arg.equals("1"))
234	{
235	expand_contents = true;
236	}
237	}
238
239	// do we want text content? Not if no_text=1.
240	// expand_document overrides this. - should it??
241	boolean get_text = true;
242	String nt_arg = (String) params.get(NO_TEXT_ARG);
243
244	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
245	logger.error("SETTING GET TEXT TO FALSE");
246	get_text = false;
247	} else {
248	logger.error("GET TEXT REMAINS TRUE");
249	}
250
251	// the_document is where all the doc info - structure and metadata etc
252	// is added into, to be returned in the page
253	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
254	page_response.appendChild(the_document);
255
256	// create a basic doc list containing the current node
257	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
258	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
259	basic_doc_list.appendChild(current_doc);
260	if (document_id != null)
261	{
262	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
263	}
264	else
265	{
266	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
267	// do we need this??
268	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
269	}
270
271	if (document_type == null)
272	{
273	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
274	}
275	if (document_type == null)
276	{
277	logger.debug("doctype is null, setting to simple");
278	document_type = GSXML.DOC_TYPE_SIMPLE;
279	}
280
281	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
282
283
284	// Create a parameter list to specify the required structure information
285	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
286
287	if (service_params != null)
288	{
289	GSXML.addParametersToList(ds_param_list, service_params);
290	}
291
292	Element ds_param = null;
293	boolean get_structure = false;
294	boolean get_structure_info = false;
295	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
296	{
297	get_structure_info = true;
298
299	if (expand_contents)
300	{
301	ds_param = doc.createElement(GSXML.PARAM_ELEM);
302	ds_param_list.appendChild(ds_param);
303	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
304	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
305	}
306
307	// get the info needed for paged naviagtion
308	ds_param = doc.createElement(GSXML.PARAM_ELEM);
309	ds_param_list.appendChild(ds_param);
310	ds_param.setAttribute(GSXML.NAME_ATT, "info");
311	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
312	ds_param = doc.createElement(GSXML.PARAM_ELEM);
313	ds_param_list.appendChild(ds_param);
314	ds_param.setAttribute(GSXML.NAME_ATT, "info");
315	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
316	ds_param = doc.createElement(GSXML.PARAM_ELEM);
317	ds_param_list.appendChild(ds_param);
318	ds_param.setAttribute(GSXML.NAME_ATT, "info");
319	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
320
321	if (get_siblings)
322	{
323	ds_param = doc.createElement(GSXML.PARAM_ELEM);
324	ds_param_list.appendChild(ds_param);
325	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
326	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
327	}
328
329	}
330	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
331	{
332	get_structure = true;
333	if (expand_contents)
334	{
335	ds_param = doc.createElement(GSXML.PARAM_ELEM);
336	ds_param_list.appendChild(ds_param);
337	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
338	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
339	}
340	else
341	{
342	// get the info needed for table of contents
343	ds_param = doc.createElement(GSXML.PARAM_ELEM);
344	ds_param_list.appendChild(ds_param);
345	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
346	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
347	ds_param = doc.createElement(GSXML.PARAM_ELEM);
348	ds_param_list.appendChild(ds_param);
349	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
350	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
351	if (get_siblings)
352	{
353	ds_param = doc.createElement(GSXML.PARAM_ELEM);
354	ds_param_list.appendChild(ds_param);
355	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
356	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
357	}
358	}
359	}
360	else
361	{
362	// we dont need any structure
363	}
364
365	boolean has_dummy = false;
366	if (get_structure \|\| get_structure_info)
367	{
368
369	// Build a request to obtain the document structure
370	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
371	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
372	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
373	ds_message.appendChild(ds_request);
374	ds_request.appendChild(ds_param_list);
375
376	// add the node list we created earlier
377	ds_request.appendChild(basic_doc_list);
378
379	// Process the document structure retrieve message
380	Element ds_response_message = (Element) this.mr.process(ds_message);
381	if (processErrorElements(ds_response_message, page_response))
382	{
383	return result;
384	}
385
386	// get the info and print out
387	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
388	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
389	path = GSPath.appendLink(path, "nodeStructureInfo");
390	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
391	// get the doc_node bit
392	if (ds_response_struct_info != null)
393	{
394	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
395	}
396	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
397	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
398	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
399	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
400
401	if (ds_response_structure != null)
402	{
403	// add the contents of the structure bit into the_document
404	NodeList structs = ds_response_structure.getChildNodes();
405	for (int i = 0; i < structs.getLength(); i++)
406	{
407	the_document.appendChild(doc.importNode(structs.item(i), true));
408	}
409	}
410	else
411	{
412	// no structure nodes, so put in a dummy doc node
413	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
414	if (document_id != null)
415	{
416	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
417	}
418	else
419	{
420	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
421
422	}
423	the_document.appendChild(doc_node);
424	has_dummy = true;
425	}
426	}
427	else
428	{ // a simple type - we dont have a dummy node for simple
429	// should think about this more
430	// no structure request, so just put in a dummy doc node
431	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
432	if (document_id != null)
433	{
434	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
435	}
436	else
437	{
438	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
439	}
440	the_document.appendChild(doc_node);
441	has_dummy = true;
442	}
443
444	// Build a request to obtain some document metadata
445	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
446	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
447	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
448	dm_message.appendChild(dm_request);
449	// Create a parameter list to specify the required metadata information
450
451	HashSet<String> meta_names = new HashSet<String>();
452	meta_names.add("Title"); // the default
453	if (format_elem != null)
454	{
455	getRequiredMetadataNames(format_elem, meta_names);
456	}
457
458	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
459	if (extraMetaListElem != null)
460	{
461	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
462	for (int i = 0; i < extraMetaList.getLength(); i++)
463	{
464	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
465	}
466	}
467
468	Element dm_param_list = createMetadataParamList(doc,meta_names);
469	if (service_params != null)
470	{
471	GSXML.addParametersToList(dm_param_list, service_params);
472	}
473
474	dm_request.appendChild(dm_param_list);
475
476	// create the doc node list for the metadata request
477	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
478	dm_request.appendChild(dm_doc_list);
479
480	// Add each node from the structure response into the metadata request
481	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
482	for (int i = 0; i < doc_nodes.getLength(); i++)
483	{
484	Element doc_node = (Element) doc_nodes.item(i);
485	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
486
487	// Add the documentNode to the list
488	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
489	dm_doc_list.appendChild(dm_doc_node);
490	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
491	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
492	if (document_id == null){
493	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
494	}
495
496	}
497
498	// we also want a metadata request to the top level document to get
499	// assocfilepath - this could be cached too
500	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
501	dm_message.appendChild(doc_meta_request);
502	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
503	if (service_params != null)
504	{
505	GSXML.addParametersToList(doc_meta_param_list, service_params);
506	}
507
508	doc_meta_request.appendChild(doc_meta_param_list);
509	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
510	doc_meta_param_list.appendChild(doc_param);
511	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
512	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
513
514	// create the doc node list for the metadata request
515	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
516	doc_meta_request.appendChild(doc_list);
517
518	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
519	// the node we want is the root document node
520	if (document_id != null)
521	{
522	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
523	}
524	/*else
525	{
526	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
527	// can we assume that href is always a top level doc??
528	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
529	//doc_node.setAttribute("externalURL", has_rl);
530	}*/
531	doc_list.appendChild(doc_node);
532
533	Element dm_response_message = (Element) this.mr.process(dm_message);
534	if (processErrorElements(dm_response_message, page_response))
535	{
536	return result;
537	}
538
539	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
540	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
541
542	// Merge the metadata with the structure information
543	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
544	for (int i = 0; i < doc_nodes.getLength(); i++)
545	{
546	GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
547	}
548	// get the top level doc metadata out
549	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
550	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
551	GSXML.mergeMetadataLists(the_document, top_doc_node);
552
553	// do we want doc text content? If not, we are done.
554	if (!get_text) {
555	// don't get text
556	return result;
557	}
558
559	// Build a request to obtain some document content
560	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
561	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
562	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
563	dc_message.appendChild(dc_request);
564
565	// Create a parameter list to specify the request parameters - empty for now
566	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
567	if (service_params != null)
568	{
569	GSXML.addParametersToList(dc_param_list, service_params);
570	}
571
572	dc_request.appendChild(dc_param_list);
573
574	// get the content
575	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
576	if (expand_document)
577	{
578	dc_request.appendChild(dm_doc_list);
579	}
580	else
581	{
582	dc_request.appendChild(basic_doc_list);
583	}
584	Element dc_response_message = (Element) this.mr.process(dc_message);
585	if (processErrorElements(dc_response_message, page_response))
586	{
587	return result;
588	}
589	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
590
591	if (expand_document)
592	{
593	// Merge the content with the structure information
594	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
595	for (int i = 0; i < doc_nodes.getLength(); i++)
596	{
597	Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
598	if (content != null)
599	{
600	if (highlight_query_terms)
601	{
602	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
603	content = highlightQueryTerms(request, node_id, (Element) content);
604	}
605
606	doc_nodes.item(i).appendChild(doc.importNode(content, true));
607	}
608	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
609	}
610	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
611	Element dummy_node = (Element) doc_nodes.item(0);
612	the_document.removeChild(dummy_node);
613	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
614	NodeList dummy_children = dummy_node.getChildNodes();
615	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
616	{
617	// special case as we don't want more than one metadata list
618	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
619	{
620	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
621	}
622	else
623	{
624	the_document.appendChild(dummy_children.item(i));
625	}
626	}
627	}
628	}
629	else
630	{
631	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
632	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
633	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
634	//Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
635
636	if (dc_response_doc_content == null)
637	{
638	// no content to add
639	if (dc_response_doc.getAttribute("external").equals("true"))
640	{
641
642	//if (dc_response_doc_external != null)
643	//{
644	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
645
646	the_document.setAttribute("selectedNode", href_id);
647	the_document.setAttribute("external", href_id);
648	}
649	return result;
650	}
651	if (highlight_query_terms)
652	{
653	dc_response_doc.removeChild(dc_response_doc_content);
654
655	dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
656	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
657	}
658
659	if (provide_annotations)
660	{
661	String service_selected = (String) params.get(ENRICH_DOC_ARG);
662	if (service_selected != null && service_selected.equals("1"))
663	{
664	// now we can modifiy the response doc if needed
665	String enrich_service = (String) params.get(GSParams.SERVICE);
666	// send a message to the service
667	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
668	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
669	enrich_message.appendChild(enrich_request);
670	// check for parameters
671	HashMap e_service_params = (HashMap) params.get("s1");
672	if (e_service_params != null)
673	{
674	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
675	GSXML.addParametersToList(enrich_pl, e_service_params);
676	enrich_request.appendChild(enrich_pl);
677	}
678	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
679	enrich_request.appendChild(e_doc_list);
680	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
681
682	Node enrich_response = this.mr.process(enrich_message);
683
684	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
685	path = GSPath.createPath(links);
686	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
687
688	}
689	} // if provide_annotations
690
691	// use the returned id rather than the sent one cos there may have
692	// been modifiers such as .pr that are removed.
693	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
694	the_document.setAttribute("selectedNode", modified_doc_id);
695	if (has_dummy)
696	{
697	// change the id if necessary and add the content
698	Element dummy_node = (Element) doc_nodes.item(0);
699
700	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
701	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
702	// hack for simple type
703	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
704	{
705	// we dont want the internal docNode, just want the content and metadata in the document
706	// rethink this!!
707	the_document.removeChild(dummy_node);
708
709	NodeList dummy_children = dummy_node.getChildNodes();
710	//for (int i=0; i<dummy_children.getLength(); i++) {
711	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
712	{
713	// special case as we don't want more than one metadata list
714	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
715	{
716	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
717	}
718	else
719	{
720	the_document.appendChild(dummy_children.item(i));
721	}
722	}
723	}
724
725	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
726	}
727	else
728	{
729	// Merge the document content with the metadata and structure information
730	for (int i = 0; i < doc_nodes.getLength(); i++)
731	{
732	Node dn = doc_nodes.item(i);
733	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
734	if (dn_id.equals(modified_doc_id))
735	{
736	dn.appendChild(doc.importNode(dc_response_doc_content, true));
737	break;
738	}
739	}
740	}
741	}
742	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
743	return result;
744	}
745
746	/**
747	* tell the param class what its arguments are if an action has its own
748	* arguments, this should add them to the params object - particularly
749	* important for args that should not be saved
750	*/
751	public boolean addActionParameters(GSParams params)
752	{
753	params.addParameter(GOTO_PAGE_ARG, false);
754	params.addParameter(ENRICH_DOC_ARG, false);
755	params.addParameter(EXPAND_DOCUMENT_ARG, false);
756	params.addParameter(EXPAND_CONTENTS_ARG, false);
757	params.addParameter(REALISTIC_BOOK_ARG, false);
758
759	return true;
760	}
761
762	/**
763	* this method gets the collection description, the format info, the list of
764	* enrich services, etc - stuff that is needed for the page, but is the same
765	* whatever the query is - should be cached
766	*/
767	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
768	{
769	Document doc = page_response.getOwnerDocument();
770
771	// create a message to process - contains requests for the collection
772	// description, the format element, the enrich services on offer
773	// these could all be cached
774	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
775	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
776	// the format request - ignore for now, where does this request go to??
777	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
778	info_message.appendChild(format_request);
779
780	// the enrich_services request - only do this if provide_annotations is true
781
782	if (provide_annotations)
783	{
784	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
785	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
786	info_message.appendChild(enrich_services_request);
787	}
788
789	Element info_response = (Element) this.mr.process(info_message);
790
791	// the collection is the first response
792	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
793	Element format_resp = (Element) responses.item(0);
794
795	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
796	if (format_elem != null)
797	{
798	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
799	if (global_format_elem != null)
800	{
801	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
802	}
803
804	// set the format type
805	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
806	page_response.appendChild(doc.importNode(format_elem, true));
807	}
808
809	if (provide_annotations)
810	{
811	Element services_resp = (Element) responses.item(1);
812
813	// a new message for the mr
814	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
815	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
816	boolean service_found = false;
817	for (int j = 0; j < e_services.getLength(); j++)
818	{
819	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
820	{
821	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
822	enrich_message.appendChild(s);
823	service_found = true;
824	}
825	}
826	if (service_found)
827	{
828	Element enrich_response = (Element) this.mr.process(enrich_message);
829
830	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
831	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
832	for (int i = 0; i < e_responses.getLength(); i++)
833	{
834	Element e_resp = (Element) e_responses.item(i);
835	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
836	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
837	service_list.appendChild(e_service);
838	}
839	page_response.appendChild(service_list);
840	}
841	} // if provide_annotations
842	return true;
843
844	}
845
846	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
847	{
848	Document doc = basic_doc_list.getOwnerDocument();
849
850	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
851	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
852	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
853	ds_message.appendChild(ds_request);
854
855	// Create a parameter list to specify the required structure information
856	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
857	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
858	ds_param_list.appendChild(ds_param);
859	ds_param.setAttribute(GSXML.NAME_ATT, "info");
860	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
861
862	ds_request.appendChild(ds_param_list);
863
864	// add the node list we created earlier
865	ds_request.appendChild(basic_doc_list);
866
867	// Process the document structure retrieve message
868	Element ds_response_message = (Element) this.mr.process(ds_message);
869	if (processErrorElements(ds_response_message, page_response))
870	{
871	return null;
872	}
873
874	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
875	String path = GSPath.createPath(links);
876	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
877	if (info_elem == null) {
878	return null;
879	}
880	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
881	if (doctype_elem != null)
882	{
883	String doc_type = doctype_elem.getAttribute("value");
884	return doc_type;
885	}
886	return null;
887	}
888
889	/** run the XSLT transform which converts from doc.xml format to our internal document format */
890	protected Element transformArchiveToDocument(Element section) {
891
892	String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
893	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
894	if (stylesheet_doc == null) {
895	logger.error("Couldn't load in stylesheet "+stylesheet_file);
896	return section;
897	}
898
899	Document section_doc = XMLConverter.newDOM();
900	section_doc.appendChild(section_doc.importNode(section, true));
901	Node result = this.transformer.transform(stylesheet_doc, section_doc);
902	logger.error("transform result = "+XMLConverter.getPrettyString(result));
903
904	Element new_element;
905	if (result.getNodeType() == Node.DOCUMENT_NODE)
906	{
907	new_element = ((Document) result).getDocumentElement();
908	}
909	else
910	{
911	new_element = (Element) result;
912	}
913
914
915	return new_element;
916
917	}
918
919
920	/**
921	* this involves a bit of a hack to get the equivalent query terms - has to
922	* requery the query service - uses the last selected service name. (if it
923	* ends in query). should this action do the query or should it send a
924	* message to the query action? but that will involve lots of extra stuff.
925	* also doesn't handle phrases properly - just highlights all the terms
926	* found in the text.
927	*/
928	protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
929	{
930	Document doc = request.getOwnerDocument();
931
932	// do the query again to get term info
933	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
934	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
935
936	HashMap previous_params = (HashMap) params.get("p");
937	if (previous_params == null)
938	{
939	return dc_response_doc_content;
940	}
941	String service_name = (String) previous_params.get(GSParams.SERVICE);
942	if (service_name == null \|\| !service_name.endsWith("Query"))
943	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
944	logger.debug("invalid service, not doing highlighting");
945	return dc_response_doc_content;
946	}
947	String collection = (String) params.get(GSParams.COLLECTION);
948	UserContext userContext = new UserContext(request);
949	String to = GSPath.appendLink(collection, service_name);
950
951	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
952	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
953	mr_query_message.appendChild(mr_query_request);
954
955	// paramList
956	HashMap service_params = (HashMap) params.get("s1");
957
958	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
959	GSXML.addParametersToList(query_param_list, service_params);
960	if (current_node_id != null) {
961	GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
962	} else {
963	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
964	}
965	mr_query_request.appendChild(query_param_list);
966	// do the query
967	Element mr_query_response = (Element) this.mr.process(mr_query_message);
968	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
969	Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
970	// For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
971	if (highlighted_Node != null)
972	{
973	// Build a request to process highlighted text
974
975	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
976	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
977	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
978	hl_message.appendChild(dc_request);
979
980	// Create a parameter list to specify the request parameters - empty for now
981	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
982	dc_request.appendChild(dc_param_list);
983
984	// get the content
985	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
986	dc_request.appendChild(doc_list);
987	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
988	doc_list.appendChild(current_doc);
989	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
990	//Append highlighted content to request for processing
991	dc_request.appendChild(doc.importNode(highlighted_Node, true));
992	Element hl_response_message = (Element) this.mr.process(hl_message);
993
994	//Get results
995	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
996	Element content = (Element) contentList.item(0);
997	return content;
998	}
999	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1000	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1001	if (query_term_list_element == null)
1002	{
1003	// no term info
1004	logger.error("No query term information.\n");
1005	return dc_response_doc_content;
1006	}
1007
1008	String content = GSXML.getNodeText(dc_response_doc_content);
1009
1010	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1011	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1012
1013	HashSet<String> query_term_variants = new HashSet<String>();
1014	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1015	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1016	{
1017	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1018	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1019	{
1020	for (int i = 0; i < terms_nodelist.getLength(); i++)
1021	{
1022	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1023	String termValueU = null;
1024	String termValueL = null;
1025
1026	if (termValue.length() > 1)
1027	{
1028	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1029	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1030	}
1031	else
1032	{
1033	termValueU = termValue.substring(0, 1).toUpperCase();
1034	termValueL = termValue.substring(0, 1).toLowerCase();
1035	}
1036
1037	query_term_variants.add(termValueU);
1038	query_term_variants.add(termValueL);
1039	}
1040	}
1041	}
1042	else
1043	{
1044	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1045	{
1046	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1047	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1048	for (int j = 0; j < equivalent_terms.length; j++)
1049	{
1050	query_term_variants.add(equivalent_terms[j]);
1051	}
1052	}
1053	}
1054
1055	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1056
1057	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1058	String performed_query = GSXML.getNodeText(query_element) + " ";
1059
1060	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1061	int term_start = 0;
1062	boolean in_term = false;
1063	boolean in_phrase = false;
1064	for (int i = 0; i < performed_query.length(); i++)
1065	{
1066	char character = performed_query.charAt(i);
1067	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1068
1069	// Has a query term just started?
1070	if (in_term == false && is_character_letter_or_digit == true)
1071	{
1072	in_term = true;
1073	term_start = i;
1074	}
1075
1076	// Or has a term just finished?
1077	else if (in_term == true && is_character_letter_or_digit == false)
1078	{
1079	in_term = false;
1080	String term = performed_query.substring(term_start, i);
1081
1082	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1083	if (term_element != null)
1084	{
1085
1086	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1087
1088	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1089	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1090	{
1091	String termValueU = null;
1092	String termValueL = null;
1093
1094	if (term.length() > 1)
1095	{
1096	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1097	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1098	}
1099	else
1100	{
1101	termValueU = term.substring(0, 1).toUpperCase();
1102	termValueL = term.substring(0, 1).toLowerCase();
1103	}
1104
1105	phrase_query_p_term_x_variants.add(termValueU);
1106	phrase_query_p_term_x_variants.add(termValueL);
1107	}
1108	else
1109	{
1110	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1111	{
1112	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1113	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1114	for (int k = 0; k < term_equivalent_terms.length; k++)
1115	{
1116	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1117	}
1118	}
1119	}
1120	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1121
1122	if (in_phrase == false)
1123	{
1124	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1125	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1126	}
1127	}
1128	}
1129	// Watch for phrases (surrounded by quotes)
1130	if (character == '\"')
1131	{
1132	// Has a phrase just started?
1133	if (in_phrase == false)
1134	{
1135	in_phrase = true;
1136	}
1137	// Or has a phrase just finished?
1138	else if (in_phrase == true)
1139	{
1140	in_phrase = false;
1141	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1142	}
1143
1144	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1145	}
1146	}
1147
1148	return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1149	}
1150
1151	/**
1152	* Highlights query terms in a piece of text.
1153	*/
1154	private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1155	{
1156	// Convert the content string to an array of characters for speed
1157	char[] content_characters = new char[content.length()];
1158	content.getChars(0, content.length(), content_characters, 0);
1159
1160	// Now skim through the content, identifying word matches
1161	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1162	int word_start = 0;
1163	boolean in_word = false;
1164	boolean preceding_word_matched = false;
1165	boolean inTag = false;
1166	for (int i = 0; i < content_characters.length; i++)
1167	{
1168	//We don't want to find words inside HTML tags
1169	if (content_characters[i] == '<')
1170	{
1171	inTag = true;
1172	continue;
1173	}
1174	else if (inTag && content_characters[i] == '>')
1175	{
1176	inTag = false;
1177	}
1178	else if (inTag)
1179	{
1180	continue;
1181	}
1182
1183	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1184
1185	// Has a word just started?
1186	if (in_word == false && is_character_letter_or_digit == true)
1187	{
1188	in_word = true;
1189	word_start = i;
1190	}
1191
1192	// Or has a word just finished?
1193	else if (in_word == true && is_character_letter_or_digit == false)
1194	{
1195	in_word = false;
1196
1197	// Check if the word matches any of the query term equivalents
1198	String word = new String(content_characters, word_start, (i - word_start));
1199	if (query_term_variants.contains(word))
1200	{
1201	// We have found a matching word, so remember its location
1202	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1203	preceding_word_matched = true;
1204	}
1205	else
1206	{
1207	preceding_word_matched = false;
1208	}
1209	}
1210	}
1211
1212	// Don't forget the last word...
1213	if (in_word == true)
1214	{
1215	// Check if the word matches any of the query term equivalents
1216	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1217	if (query_term_variants.contains(word))
1218	{
1219	// We have found a matching word, so remember its location
1220	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1221	}
1222	}
1223
1224	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1225	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1226
1227	// Deal with phrases now
1228	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1229	for (int i = 0; i < word_matches.size(); i++)
1230	{
1231	WordMatch word_match = word_matches.get(i);
1232
1233	// See if any partial phrase matches are extended by this word
1234	if (word_match.preceding_word_matched)
1235	{
1236	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1237	{
1238	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1239	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1240	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1241	if (phrase_query_p_term_x_variants.contains(word_match.word))
1242	{
1243	partial_phrase_match.num_words_matched++;
1244
1245	// Has a complete phrase match occurred?
1246	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1247	{
1248	// Check for overlaps by looking at the previous highlight range
1249	if (!highlight_end_positions.isEmpty())
1250	{
1251	int last_highlight_index = highlight_end_positions.size() - 1;
1252	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1253	if (last_highlight_end > partial_phrase_match.start_position)
1254	{
1255	// There is an overlap, so remove the previous phrase match
1256	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1257	highlight_end_positions.remove(last_highlight_index);
1258	partial_phrase_match.start_position = last_highlight_start;
1259	}
1260	}
1261
1262	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1263	highlight_end_positions.add(new Integer(word_match.end_position));
1264	}
1265	// No, but add the partial match back into the list for next time
1266	else
1267	{
1268	partial_phrase_matches.add(partial_phrase_match);
1269	}
1270	}
1271	}
1272	}
1273	else
1274	{
1275	partial_phrase_matches.clear();
1276	}
1277
1278	// See if this word is at the start of any of the phrases
1279	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1280	{
1281	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1282	if (phrase_query_p_term_variants_list.size()>0) {
1283	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1284	if (phrase_query_p_term_1_variants.contains(word_match.word))
1285	{
1286	// If this phrase is just one word long, we have a complete match
1287	if (phrase_query_p_term_variants_list.size() == 1)
1288	{
1289	highlight_start_positions.add(new Integer(word_match.start_position));
1290	highlight_end_positions.add(new Integer(word_match.end_position));
1291	}
1292	// Otherwise we have the start of a potential phrase match
1293	else
1294	{
1295	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1296	}
1297	}
1298	}
1299	}
1300	}
1301
1302	// Now add the annotation tags into the document at the correct points
1303	Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1304
1305	int last_wrote = 0;
1306	for (int i = 0; i < highlight_start_positions.size(); i++)
1307	{
1308	int highlight_start = highlight_start_positions.get(i).intValue();
1309	int highlight_end = highlight_end_positions.get(i).intValue();
1310
1311	// Print anything before the highlight range
1312	if (last_wrote < highlight_start)
1313	{
1314	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1315	content_element.appendChild(doc.createTextNode(preceding_text));
1316	}
1317
1318	// Print the highlight text, annotated
1319	if (highlight_end > last_wrote)
1320	{
1321	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1322	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1323	annotation_element.setAttribute("type", "query_term");
1324	content_element.appendChild(annotation_element);
1325	last_wrote = highlight_end;
1326	}
1327	}
1328
1329	// Finish off any unwritten text
1330	if (last_wrote < content_characters.length)
1331	{
1332	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1333	content_element.appendChild(doc.createTextNode(remaining_text));
1334	}
1335	return content_element;
1336	}
1337
1338	static private class WordMatch
1339	{
1340	public String word;
1341	public int start_position;
1342	public int end_position;
1343	public boolean preceding_word_matched;
1344
1345	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1346	{
1347	this.word = word;
1348	this.start_position = start_position;
1349	this.end_position = end_position;
1350	this.preceding_word_matched = preceding_word_matched;
1351	}
1352	}
1353
1354	static private class PartialPhraseMatch
1355	{
1356	public int start_position;
1357	public int query_phrase_number;
1358	public int num_words_matched;
1359
1360	public PartialPhraseMatch(int start_position, int query_phrase_number)
1361	{
1362	this.start_position = start_position;
1363	this.query_phrase_number = query_phrase_number;
1364	this.num_words_matched = 1;
1365	}
1366	}
1367	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: