Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32069

Last change on this file since 32069 was 32069, checked in by kjdon, 6 years ago
forgot to add the import GlobalProperties line
Property svn:keywords set to `Author Date Id Revision`
File size: 49.7 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24	import org.greenstone.util.GlobalProperties;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Text;
31	import org.w3c.dom.NodeList;
32
33	// General Java classes
34	import java.util.ArrayList;
35	import java.util.HashMap;
36	import java.util.HashSet;
37	import java.io.File;
38	import java.io.Serializable;
39
40	import org.apache.log4j.*;
41
42	/** Action class for retrieving Documents via the message router */
43	public class DocumentAction extends Action
44	{
45
46	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName()); // log4j logger shared by this action
47
48	// this is used to specify that the sibling nodes of a selected one should be obtained
49	public static final String SIBLING_ARG = "sib"; // a value of "1" makes process() request the "siblings" structure
50	public static final String GOTO_PAGE_ARG = "gp"; // when non-empty, process() appends "." + value + ".ss" to the requested node id
51	public static final String ENRICH_DOC_ARG = "end"; // a value of "1" (with provide_annotations on) sends the content to the selected enrich service
52	public static final String EXPAND_DOCUMENT_ARG = "ed"; // a value of "1" retrieves the text of every node and implies expanded contents
53	public static final String EXPAND_CONTENTS_ARG = "ec"; // a value of "1" requests the "entire" structure rather than ancestors/children
54	public static final String REALISTIC_BOOK_ARG = "book"; // registered in addActionParameters(); not read in the part of the class visible here
55	public static final String NO_TEXT_ARG = "noText"; // a value of "1" skips the content request, unless the document is being expanded
56	public static final String DOC_EDIT_ARG = "docEdit"; // a value of "1" makes process() return the archive section instead of the normal page
57
58	/**
59	* if this is set to true, when a document is displayed, any annotation type
60	* services (enrich) will be offered to the user as well
61	*/
62	protected boolean provide_annotations = false; // switched on by the "displayAnnotationService" config param in configure()
63
64	protected boolean highlight_query_terms = false; // switched on by the "highlightQueryTerms" config param; when true, retrieved content is passed through highlightQueryTerms()
65
66	public boolean configure()
67	{
68	super.configure();
69	String highlight = (String) config_params.get("highlightQueryTerms");
70	if (highlight != null && highlight.equals("true"))
71	{
72	highlight_query_terms = true;
73	}
74	String annotate = (String) config_params.get("displayAnnotationService");
75	if (annotate != null && annotate.equals("true"))
76	{
77	provide_annotations = true;
78	}
79	return true;
80	}
81
82	public Node process(Node message_node)
83	{
84	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86	Element message = GSXML.nodeToElement(message_node);
87	Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89	// the response
90	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92	result.appendChild(page_response);
93
94	// get the request - assume only one
95	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99	// just in case there are some that need to get passed to the services
100	HashMap service_params = (HashMap) params.get("s0");
101
102	String collection = (String) params.get(GSParams.COLLECTION);
103	String document_id = (String) params.get(GSParams.DOCUMENT);
104	if (document_id != null && document_id.equals(""))
105	{
106	document_id = null;
107	}
108	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109	if (href != null && href.equals(""))
110	{
111	href = null;
112	}
113	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114	if (document_id == null && href == null)
115	{
116	logger.error("no document specified!");
117	return result;
118	}
119	if (rl != null && rl.equals("0"))
120	{
121	// this is a true external link, we should have been directed to a different page or action
122	logger.error("rl value was 0, shouldn't get here");
123	return result;
124	}
125
126	UserContext userContext = new UserContext(request);
127
128	//append site metadata
129	addSiteMetadata(page_response, userContext);
130	addInterfaceOptions(page_response);
131
132	// get the additional data needed for the page
133	getBackgroundData(page_response, collection, userContext);
134	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136	if (format_elem != null) {
137	// lets look for param defaults set in config file
138	NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139	for (int i=0; i<param_defaults.getLength(); i++) {
140	Element p = (Element)param_defaults.item(i);
141	String name = p.getAttribute(GSXML.NAME_ATT);
142	if (params.get(name) ==null) {
143	// wasn't set from interface
144	String value = p.getAttribute(GSXML.VALUE_ATT);
145	params.put(name, value );
146	// also add into request param xml so that xslt knows it too
147	GSXML.addParameterToList(cgi_paramList, name, value);
148	}
149	}
150	}
151
152
153	boolean editing_document = false;
154	String doc_edit = (String) params.get(DOC_EDIT_ARG);
155	if (doc_edit != null && doc_edit.equals("1")) {
156	editing_document = true;
157	}
158
159	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
160	if (editing_document) {
161
162	// call get archive doc
163	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
164	String to = "DocXMLGetSection";
165	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
166	dx_message.appendChild(dx_request);
167	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
168	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
169	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
170	dx_request.appendChild(dx_section);
171
172	Element dx_response_message = (Element) this.mr.process(dx_message);
173	if (processErrorElements(dx_response_message, page_response))
174	{
175	return result;
176	}
177
178	// get the section out
179	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
180	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
181	if (section == null) {
182	logger.error("no archive doc returned for "+document_id);
183	return result;
184	}
185	// convert the archive format into the internal format that the page response requires
186
187	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
188	page_response.appendChild(doc_elem);
189	section.setAttribute(GSXML.NODE_ID_ATT, document_id);
190
191	Element transformed_section = transformArchiveToDocument(section);
192	doc_elem.appendChild(doc.importNode(transformed_section, true));
193	logger.error("dx result = "+XMLConverter.getPrettyString(result));
194	return result;
195	}
196
197	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
198	if (document_type != null && document_type.equals(""))
199	{
200	//document_type = "hierarchy";
201	document_type = null; // we'll get it later if not already specified
202	}
203	//whether to retrieve siblings or not
204	boolean get_siblings = false;
205	String sibs = (String) params.get(SIBLING_ARG);
206	if (sibs != null && sibs.equals("1"))
207	{
208	get_siblings = true;
209	}
210
211	String doc_id_modifier = "";
212	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
213	if (sibling_num != null && !sibling_num.equals(""))
214	{
215	// we have to modify the doc name
216	doc_id_modifier = "." + sibling_num + ".ss";
217	}
218
219	boolean expand_document = false;
220	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
221	if (ed_arg != null && ed_arg.equals("1"))
222	{
223	expand_document = true;
224	}
225
226	boolean expand_contents = false;
227	if (expand_document)
228	{ // we always expand the contents with the text
229	expand_contents = true;
230	}
231	else
232	{
233	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
234	if (ec_arg != null && ec_arg.equals("1"))
235	{
236	expand_contents = true;
237	}
238	}
239
240	// do we want text content? Not if no_text=1.
241	// expand_document overrides this. - should it??
242	boolean get_text = true;
243	String nt_arg = (String) params.get(NO_TEXT_ARG);
244
245	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
246	logger.error("SETTING GET TEXT TO FALSE");
247	get_text = false;
248	} else {
249	logger.error("GET TEXT REMAINS TRUE");
250	}
251
252	// the_document is where all the doc info - structure and metadata etc
253	// is added into, to be returned in the page
254	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
255	page_response.appendChild(the_document);
256
257	// create a basic doc list containing the current node
258	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
259	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
260	basic_doc_list.appendChild(current_doc);
261	if (document_id != null)
262	{
263	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
264	}
265	else
266	{
267	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
268	// do we need this??
269	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
270	}
271
272	if (document_type == null)
273	{
274	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
275	}
276	if (document_type == null)
277	{
278	logger.debug("doctype is null, setting to simple");
279	document_type = GSXML.DOC_TYPE_SIMPLE;
280	}
281
282	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
283
284
285	// Create a parameter list to specify the required structure information
286	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
287
288	if (service_params != null)
289	{
290	GSXML.addParametersToList(ds_param_list, service_params);
291	}
292
293	Element ds_param = null;
294	boolean get_structure = false;
295	boolean get_structure_info = false;
296	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
297	{
298	get_structure_info = true;
299
300	if (expand_contents)
301	{
302	ds_param = doc.createElement(GSXML.PARAM_ELEM);
303	ds_param_list.appendChild(ds_param);
304	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
305	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
306	}
307
308	// get the info needed for paged naviagtion
309	ds_param = doc.createElement(GSXML.PARAM_ELEM);
310	ds_param_list.appendChild(ds_param);
311	ds_param.setAttribute(GSXML.NAME_ATT, "info");
312	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
313	ds_param = doc.createElement(GSXML.PARAM_ELEM);
314	ds_param_list.appendChild(ds_param);
315	ds_param.setAttribute(GSXML.NAME_ATT, "info");
316	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
317	ds_param = doc.createElement(GSXML.PARAM_ELEM);
318	ds_param_list.appendChild(ds_param);
319	ds_param.setAttribute(GSXML.NAME_ATT, "info");
320	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
321
322	if (get_siblings)
323	{
324	ds_param = doc.createElement(GSXML.PARAM_ELEM);
325	ds_param_list.appendChild(ds_param);
326	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
328	}
329
330	}
331	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
332	{
333	get_structure = true;
334	if (expand_contents)
335	{
336	ds_param = doc.createElement(GSXML.PARAM_ELEM);
337	ds_param_list.appendChild(ds_param);
338	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
340	}
341	else
342	{
343	// get the info needed for table of contents
344	ds_param = doc.createElement(GSXML.PARAM_ELEM);
345	ds_param_list.appendChild(ds_param);
346	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
347	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
348	ds_param = doc.createElement(GSXML.PARAM_ELEM);
349	ds_param_list.appendChild(ds_param);
350	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
351	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
352	if (get_siblings)
353	{
354	ds_param = doc.createElement(GSXML.PARAM_ELEM);
355	ds_param_list.appendChild(ds_param);
356	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
357	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
358	}
359	}
360	}
361	else
362	{
363	// we dont need any structure
364	}
365
366	boolean has_dummy = false;
367	if (get_structure \|\| get_structure_info)
368	{
369
370	// Build a request to obtain the document structure
371	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
372	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
373	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
374	ds_message.appendChild(ds_request);
375	ds_request.appendChild(ds_param_list);
376
377	// add the node list we created earlier
378	ds_request.appendChild(basic_doc_list);
379
380	// Process the document structure retrieve message
381	Element ds_response_message = (Element) this.mr.process(ds_message);
382	if (processErrorElements(ds_response_message, page_response))
383	{
384	return result;
385	}
386
387	// get the info and print out
388	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
389	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
390	path = GSPath.appendLink(path, "nodeStructureInfo");
391	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
392	// get the doc_node bit
393	if (ds_response_struct_info != null)
394	{
395	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
396	}
397	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
398	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
399	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
400	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
401
402	if (ds_response_structure != null)
403	{
404	// add the contents of the structure bit into the_document
405	NodeList structs = ds_response_structure.getChildNodes();
406	for (int i = 0; i < structs.getLength(); i++)
407	{
408	the_document.appendChild(doc.importNode(structs.item(i), true));
409	}
410	}
411	else
412	{
413	// no structure nodes, so put in a dummy doc node
414	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
415	if (document_id != null)
416	{
417	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
418	}
419	else
420	{
421	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
422
423	}
424	the_document.appendChild(doc_node);
425	has_dummy = true;
426	}
427	}
428	else
429	{ // a simple type - we dont have a dummy node for simple
430	// should think about this more
431	// no structure request, so just put in a dummy doc node
432	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
433	if (document_id != null)
434	{
435	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
436	}
437	else
438	{
439	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
440	}
441	the_document.appendChild(doc_node);
442	has_dummy = true;
443	}
444
445	// Build a request to obtain some document metadata
446	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
447	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
448	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
449	dm_message.appendChild(dm_request);
450	// Create a parameter list to specify the required metadata information
451
452	HashSet<String> meta_names = new HashSet<String>();
453	meta_names.add("Title"); // the default
454	if (format_elem != null)
455	{
456	getRequiredMetadataNames(format_elem, meta_names);
457	}
458
459	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
460	if (extraMetaListElem != null)
461	{
462	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
463	for (int i = 0; i < extraMetaList.getLength(); i++)
464	{
465	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
466	}
467	}
468
469	Element dm_param_list = createMetadataParamList(doc,meta_names);
470	if (service_params != null)
471	{
472	GSXML.addParametersToList(dm_param_list, service_params);
473	}
474
475	dm_request.appendChild(dm_param_list);
476
477	// create the doc node list for the metadata request
478	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
479	dm_request.appendChild(dm_doc_list);
480
481	// Add each node from the structure response into the metadata request
482	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
483	for (int i = 0; i < doc_nodes.getLength(); i++)
484	{
485	Element doc_node = (Element) doc_nodes.item(i);
486	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
487
488	// Add the documentNode to the list
489	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
490	dm_doc_list.appendChild(dm_doc_node);
491	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
492	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
493	if (document_id == null){
494	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
495	}
496
497	}
498
499	// we also want a metadata request to the top level document to get
500	// assocfilepath - this could be cached too
501	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
502	dm_message.appendChild(doc_meta_request);
503	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
504	if (service_params != null)
505	{
506	GSXML.addParametersToList(doc_meta_param_list, service_params);
507	}
508
509	doc_meta_request.appendChild(doc_meta_param_list);
510	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
511	doc_meta_param_list.appendChild(doc_param);
512	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
513	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
514
515	// create the doc node list for the metadata request
516	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
517	doc_meta_request.appendChild(doc_list);
518
519	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
520	// the node we want is the root document node
521	if (document_id != null)
522	{
523	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
524	}
525	/*else
526	{
527	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
528	// can we assume that href is always a top level doc??
529	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
530	//doc_node.setAttribute("externalURL", has_rl);
531	}*/
532	doc_list.appendChild(doc_node);
533
534	Element dm_response_message = (Element) this.mr.process(dm_message);
535	if (processErrorElements(dm_response_message, page_response))
536	{
537	return result;
538	}
539
540	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
541	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
542
543	// Merge the metadata with the structure information
544	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
545	for (int i = 0; i < doc_nodes.getLength(); i++)
546	{
547	GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
548	}
549	// get the top level doc metadata out
550	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
551	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
552	GSXML.mergeMetadataLists(the_document, top_doc_node);
553
554	// do we want doc text content? If not, we are done.
555	if (!get_text) {
556	// don't get text
557	return result;
558	}
559
560	// Build a request to obtain some document content
561	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
562	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
563	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
564	dc_message.appendChild(dc_request);
565
566	// Create a parameter list to specify the request parameters - empty for now
567	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
568	if (service_params != null)
569	{
570	GSXML.addParametersToList(dc_param_list, service_params);
571	}
572
573	dc_request.appendChild(dc_param_list);
574
575	// get the content
576	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
577	if (expand_document)
578	{
579	dc_request.appendChild(dm_doc_list);
580	}
581	else
582	{
583	dc_request.appendChild(basic_doc_list);
584	}
585	Element dc_response_message = (Element) this.mr.process(dc_message);
586	if (processErrorElements(dc_response_message, page_response))
587	{
588	return result;
589	}
590	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
591
592	if (expand_document)
593	{
594	// Merge the content with the structure information
595	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
596	for (int i = 0; i < doc_nodes.getLength(); i++)
597	{
598	Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
599	if (content != null)
600	{
601	if (highlight_query_terms)
602	{
603	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
604	content = highlightQueryTerms(request, node_id, (Element) content);
605	}
606
607	doc_nodes.item(i).appendChild(doc.importNode(content, true));
608	}
609	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
610	}
611	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
612	Element dummy_node = (Element) doc_nodes.item(0);
613	the_document.removeChild(dummy_node);
614	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
615	NodeList dummy_children = dummy_node.getChildNodes();
616	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
617	{
618	// special case as we don't want more than one metadata list
619	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
620	{
621	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
622	}
623	else
624	{
625	the_document.appendChild(dummy_children.item(i));
626	}
627	}
628	}
629	}
630	else
631	{
632	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
633	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
634	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
635	//Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
636
637	if (dc_response_doc_content == null)
638	{
639	// no content to add
640	if (dc_response_doc.getAttribute("external").equals("true"))
641	{
642
643	//if (dc_response_doc_external != null)
644	//{
645	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
646
647	the_document.setAttribute("selectedNode", href_id);
648	the_document.setAttribute("external", href_id);
649	}
650	return result;
651	}
652	if (highlight_query_terms)
653	{
654	dc_response_doc.removeChild(dc_response_doc_content);
655
656	dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
657	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
658	}
659
660	if (provide_annotations)
661	{
662	String service_selected = (String) params.get(ENRICH_DOC_ARG);
663	if (service_selected != null && service_selected.equals("1"))
664	{
665	// now we can modifiy the response doc if needed
666	String enrich_service = (String) params.get(GSParams.SERVICE);
667	// send a message to the service
668	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
669	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
670	enrich_message.appendChild(enrich_request);
671	// check for parameters
672	HashMap e_service_params = (HashMap) params.get("s1");
673	if (e_service_params != null)
674	{
675	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
676	GSXML.addParametersToList(enrich_pl, e_service_params);
677	enrich_request.appendChild(enrich_pl);
678	}
679	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
680	enrich_request.appendChild(e_doc_list);
681	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
682
683	Node enrich_response = this.mr.process(enrich_message);
684
685	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
686	path = GSPath.createPath(links);
687	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
688
689	}
690	} // if provide_annotations
691
692	// use the returned id rather than the sent one cos there may have
693	// been modifiers such as .pr that are removed.
694	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
695	the_document.setAttribute("selectedNode", modified_doc_id);
696	if (has_dummy)
697	{
698	// change the id if necessary and add the content
699	Element dummy_node = (Element) doc_nodes.item(0);
700
701	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
702	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
703	// hack for simple type
704	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
705	{
706	// we dont want the internal docNode, just want the content and metadata in the document
707	// rethink this!!
708	the_document.removeChild(dummy_node);
709
710	NodeList dummy_children = dummy_node.getChildNodes();
711	//for (int i=0; i<dummy_children.getLength(); i++) {
712	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
713	{
714	// special case as we don't want more than one metadata list
715	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
716	{
717	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
718	}
719	else
720	{
721	the_document.appendChild(dummy_children.item(i));
722	}
723	}
724	}
725
726	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
727	}
728	else
729	{
730	// Merge the document content with the metadata and structure information
731	for (int i = 0; i < doc_nodes.getLength(); i++)
732	{
733	Node dn = doc_nodes.item(i);
734	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
735	if (dn_id.equals(modified_doc_id))
736	{
737	dn.appendChild(doc.importNode(dc_response_doc_content, true));
738	break;
739	}
740	}
741	}
742	}
743	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
744	return result;
745	}
746
747	/**
748	* tell the param class what its arguments are if an action has its own
749	* arguments, this should add them to the params object - particularly
750	* important for args that should not be saved
751	*/
752	public boolean addActionParameters(GSParams params)
753	{
754	params.addParameter(GOTO_PAGE_ARG, false);
755	params.addParameter(ENRICH_DOC_ARG, false);
756	params.addParameter(EXPAND_DOCUMENT_ARG, false);
757	params.addParameter(EXPAND_CONTENTS_ARG, false);
758	params.addParameter(REALISTIC_BOOK_ARG, false);
759
760	return true;
761	}
762
763	/**
764	* this method gets the collection description, the format info, the list of
765	* enrich services, etc - stuff that is needed for the page, but is the same
766	* whatever the query is - should be cached
767	*/
768	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
769	{
770	Document doc = page_response.getOwnerDocument();
771
772	// create a message to process - contains requests for the collection
773	// description, the format element, the enrich services on offer
774	// these could all be cached
775	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
776	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
777	// the format request - ignore for now, where does this request go to??
778	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
779	info_message.appendChild(format_request);
780
781	// the enrich_services request - only do this if provide_annotations is true
782
783	if (provide_annotations)
784	{
785	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
786	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
787	info_message.appendChild(enrich_services_request);
788	}
789
790	Element info_response = (Element) this.mr.process(info_message);
791
792	// the collection is the first response
793	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
794	Element format_resp = (Element) responses.item(0);
795
796	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
797	if (format_elem != null)
798	{
799	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
800	if (global_format_elem != null)
801	{
802	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
803	}
804
805	// set the format type
806	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
807	page_response.appendChild(doc.importNode(format_elem, true));
808	}
809
810	if (provide_annotations)
811	{
812	Element services_resp = (Element) responses.item(1);
813
814	// a new message for the mr
815	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
816	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
817	boolean service_found = false;
818	for (int j = 0; j < e_services.getLength(); j++)
819	{
820	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
821	{
822	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
823	enrich_message.appendChild(s);
824	service_found = true;
825	}
826	}
827	if (service_found)
828	{
829	Element enrich_response = (Element) this.mr.process(enrich_message);
830
831	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
832	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
833	for (int i = 0; i < e_responses.getLength(); i++)
834	{
835	Element e_resp = (Element) e_responses.item(i);
836	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
837	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
838	service_list.appendChild(e_service);
839	}
840	page_response.appendChild(service_list);
841	}
842	} // if provide_annotations
843	return true;
844
845	}
846
847	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
848	{
849	Document doc = basic_doc_list.getOwnerDocument();
850
851	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
852	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
853	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
854	ds_message.appendChild(ds_request);
855
856	// Create a parameter list to specify the required structure information
857	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
858	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
859	ds_param_list.appendChild(ds_param);
860	ds_param.setAttribute(GSXML.NAME_ATT, "info");
861	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
862
863	ds_request.appendChild(ds_param_list);
864
865	// add the node list we created earlier
866	ds_request.appendChild(basic_doc_list);
867
868	// Process the document structure retrieve message
869	Element ds_response_message = (Element) this.mr.process(ds_message);
870	if (processErrorElements(ds_response_message, page_response))
871	{
872	return null;
873	}
874
875	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
876	String path = GSPath.createPath(links);
877	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
878	if (info_elem == null) {
879	return null;
880	}
881	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
882	if (doctype_elem != null)
883	{
884	String doc_type = doctype_elem.getAttribute("value");
885	return doc_type;
886	}
887	return null;
888	}
889
890	/** run the XSLT transform which converts from doc.xml format to our internal document format */
891	protected Element transformArchiveToDocument(Element section) {
892
893	String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
894	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
895	if (stylesheet_doc == null) {
896	logger.error("Couldn't load in stylesheet "+stylesheet_file);
897	return section;
898	}
899
900	Document section_doc = XMLConverter.newDOM();
901	section_doc.appendChild(section_doc.importNode(section, true));
902	Node result = this.transformer.transform(stylesheet_doc, section_doc);
903	logger.error("transform result = "+XMLConverter.getPrettyString(result));
904
905	Element new_element;
906	if (result.getNodeType() == Node.DOCUMENT_NODE)
907	{
908	new_element = ((Document) result).getDocumentElement();
909	}
910	else
911	{
912	new_element = (Element) result;
913	}
914
915
916	return new_element;
917
918	}
919
920
921	/**
922	* this involves a bit of a hack to get the equivalent query terms - has to
923	* requery the query service - uses the last selected service name. (if it
924	* ends in query). should this action do the query or should it send a
925	* message to the query action? but that will involve lots of extra stuff.
926	* also doesn't handle phrases properly - just highlights all the terms
927	* found in the text.
928	*/
929	protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
930	{
931	Document doc = request.getOwnerDocument();
932
933	// do the query again to get term info
934	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
935	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
936
937	HashMap previous_params = (HashMap) params.get("p");
938	if (previous_params == null)
939	{
940	return dc_response_doc_content;
941	}
942	String service_name = (String) previous_params.get(GSParams.SERVICE);
943	if (service_name == null \|\| !service_name.endsWith("Query"))
944	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
945	logger.debug("invalid service, not doing highlighting");
946	return dc_response_doc_content;
947	}
948	String collection = (String) params.get(GSParams.COLLECTION);
949	UserContext userContext = new UserContext(request);
950	String to = GSPath.appendLink(collection, service_name);
951
952	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
953	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
954	mr_query_message.appendChild(mr_query_request);
955
956	// paramList
957	HashMap service_params = (HashMap) params.get("s1");
958
959	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
960	GSXML.addParametersToList(query_param_list, service_params);
961	if (current_node_id != null) {
962	GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
963	} else {
964	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
965	}
966	mr_query_request.appendChild(query_param_list);
967	// do the query
968	Element mr_query_response = (Element) this.mr.process(mr_query_message);
969	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
970	Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
971	// For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
972	if (highlighted_Node != null)
973	{
974	// Build a request to process highlighted text
975
976	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
977	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
978	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
979	hl_message.appendChild(dc_request);
980
981	// Create a parameter list to specify the request parameters - empty for now
982	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
983	dc_request.appendChild(dc_param_list);
984
985	// get the content
986	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
987	dc_request.appendChild(doc_list);
988	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
989	doc_list.appendChild(current_doc);
990	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
991	//Append highlighted content to request for processing
992	dc_request.appendChild(doc.importNode(highlighted_Node, true));
993	Element hl_response_message = (Element) this.mr.process(hl_message);
994
995	//Get results
996	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
997	Element content = (Element) contentList.item(0);
998	return content;
999	}
1000	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1001	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1002	if (query_term_list_element == null)
1003	{
1004	// no term info
1005	logger.error("No query term information.\n");
1006	return dc_response_doc_content;
1007	}
1008
1009	String content = GSXML.getNodeText(dc_response_doc_content);
1010
1011	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1012	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1013
1014	HashSet<String> query_term_variants = new HashSet<String>();
1015	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1016	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1017	{
1018	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1019	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1020	{
1021	for (int i = 0; i < terms_nodelist.getLength(); i++)
1022	{
1023	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1024	String termValueU = null;
1025	String termValueL = null;
1026
1027	if (termValue.length() > 1)
1028	{
1029	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1030	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1031	}
1032	else
1033	{
1034	termValueU = termValue.substring(0, 1).toUpperCase();
1035	termValueL = termValue.substring(0, 1).toLowerCase();
1036	}
1037
1038	query_term_variants.add(termValueU);
1039	query_term_variants.add(termValueL);
1040	}
1041	}
1042	}
1043	else
1044	{
1045	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1046	{
1047	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1048	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1049	for (int j = 0; j < equivalent_terms.length; j++)
1050	{
1051	query_term_variants.add(equivalent_terms[j]);
1052	}
1053	}
1054	}
1055
1056	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1057
1058	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1059	String performed_query = GSXML.getNodeText(query_element) + " ";
1060
1061	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1062	int term_start = 0;
1063	boolean in_term = false;
1064	boolean in_phrase = false;
1065	for (int i = 0; i < performed_query.length(); i++)
1066	{
1067	char character = performed_query.charAt(i);
1068	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1069
1070	// Has a query term just started?
1071	if (in_term == false && is_character_letter_or_digit == true)
1072	{
1073	in_term = true;
1074	term_start = i;
1075	}
1076
1077	// Or has a term just finished?
1078	else if (in_term == true && is_character_letter_or_digit == false)
1079	{
1080	in_term = false;
1081	String term = performed_query.substring(term_start, i);
1082
1083	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1084	if (term_element != null)
1085	{
1086
1087	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1088
1089	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1090	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1091	{
1092	String termValueU = null;
1093	String termValueL = null;
1094
1095	if (term.length() > 1)
1096	{
1097	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1098	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1099	}
1100	else
1101	{
1102	termValueU = term.substring(0, 1).toUpperCase();
1103	termValueL = term.substring(0, 1).toLowerCase();
1104	}
1105
1106	phrase_query_p_term_x_variants.add(termValueU);
1107	phrase_query_p_term_x_variants.add(termValueL);
1108	}
1109	else
1110	{
1111	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1112	{
1113	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1114	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1115	for (int k = 0; k < term_equivalent_terms.length; k++)
1116	{
1117	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1118	}
1119	}
1120	}
1121	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1122
1123	if (in_phrase == false)
1124	{
1125	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1126	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1127	}
1128	}
1129	}
1130	// Watch for phrases (surrounded by quotes)
1131	if (character == '\"')
1132	{
1133	// Has a phrase just started?
1134	if (in_phrase == false)
1135	{
1136	in_phrase = true;
1137	}
1138	// Or has a phrase just finished?
1139	else if (in_phrase == true)
1140	{
1141	in_phrase = false;
1142	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1143	}
1144
1145	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1146	}
1147	}
1148
1149	return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1150	}
1151
1152	/**
1153	* Highlights query terms in a piece of text.
1154	*/
1155	private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1156	{
1157	// Convert the content string to an array of characters for speed
1158	char[] content_characters = new char[content.length()];
1159	content.getChars(0, content.length(), content_characters, 0);
1160
1161	// Now skim through the content, identifying word matches
1162	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1163	int word_start = 0;
1164	boolean in_word = false;
1165	boolean preceding_word_matched = false;
1166	boolean inTag = false;
1167	for (int i = 0; i < content_characters.length; i++)
1168	{
1169	//We don't want to find words inside HTML tags
1170	if (content_characters[i] == '<')
1171	{
1172	inTag = true;
1173	continue;
1174	}
1175	else if (inTag && content_characters[i] == '>')
1176	{
1177	inTag = false;
1178	}
1179	else if (inTag)
1180	{
1181	continue;
1182	}
1183
1184	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1185
1186	// Has a word just started?
1187	if (in_word == false && is_character_letter_or_digit == true)
1188	{
1189	in_word = true;
1190	word_start = i;
1191	}
1192
1193	// Or has a word just finished?
1194	else if (in_word == true && is_character_letter_or_digit == false)
1195	{
1196	in_word = false;
1197
1198	// Check if the word matches any of the query term equivalents
1199	String word = new String(content_characters, word_start, (i - word_start));
1200	if (query_term_variants.contains(word))
1201	{
1202	// We have found a matching word, so remember its location
1203	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1204	preceding_word_matched = true;
1205	}
1206	else
1207	{
1208	preceding_word_matched = false;
1209	}
1210	}
1211	}
1212
1213	// Don't forget the last word...
1214	if (in_word == true)
1215	{
1216	// Check if the word matches any of the query term equivalents
1217	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1218	if (query_term_variants.contains(word))
1219	{
1220	// We have found a matching word, so remember its location
1221	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1222	}
1223	}
1224
1225	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1226	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1227
1228	// Deal with phrases now
1229	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1230	for (int i = 0; i < word_matches.size(); i++)
1231	{
1232	WordMatch word_match = word_matches.get(i);
1233
1234	// See if any partial phrase matches are extended by this word
1235	if (word_match.preceding_word_matched)
1236	{
1237	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1238	{
1239	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1240	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1241	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1242	if (phrase_query_p_term_x_variants.contains(word_match.word))
1243	{
1244	partial_phrase_match.num_words_matched++;
1245
1246	// Has a complete phrase match occurred?
1247	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1248	{
1249	// Check for overlaps by looking at the previous highlight range
1250	if (!highlight_end_positions.isEmpty())
1251	{
1252	int last_highlight_index = highlight_end_positions.size() - 1;
1253	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1254	if (last_highlight_end > partial_phrase_match.start_position)
1255	{
1256	// There is an overlap, so remove the previous phrase match
1257	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1258	highlight_end_positions.remove(last_highlight_index);
1259	partial_phrase_match.start_position = last_highlight_start;
1260	}
1261	}
1262
1263	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1264	highlight_end_positions.add(new Integer(word_match.end_position));
1265	}
1266	// No, but add the partial match back into the list for next time
1267	else
1268	{
1269	partial_phrase_matches.add(partial_phrase_match);
1270	}
1271	}
1272	}
1273	}
1274	else
1275	{
1276	partial_phrase_matches.clear();
1277	}
1278
1279	// See if this word is at the start of any of the phrases
1280	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1281	{
1282	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1283	if (phrase_query_p_term_variants_list.size()>0) {
1284	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1285	if (phrase_query_p_term_1_variants.contains(word_match.word))
1286	{
1287	// If this phrase is just one word long, we have a complete match
1288	if (phrase_query_p_term_variants_list.size() == 1)
1289	{
1290	highlight_start_positions.add(new Integer(word_match.start_position));
1291	highlight_end_positions.add(new Integer(word_match.end_position));
1292	}
1293	// Otherwise we have the start of a potential phrase match
1294	else
1295	{
1296	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1297	}
1298	}
1299	}
1300	}
1301	}
1302
1303	// Now add the annotation tags into the document at the correct points
1304	Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1305
1306	int last_wrote = 0;
1307	for (int i = 0; i < highlight_start_positions.size(); i++)
1308	{
1309	int highlight_start = highlight_start_positions.get(i).intValue();
1310	int highlight_end = highlight_end_positions.get(i).intValue();
1311
1312	// Print anything before the highlight range
1313	if (last_wrote < highlight_start)
1314	{
1315	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1316	content_element.appendChild(doc.createTextNode(preceding_text));
1317	}
1318
1319	// Print the highlight text, annotated
1320	if (highlight_end > last_wrote)
1321	{
1322	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1323	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1324	annotation_element.setAttribute("type", "query_term");
1325	content_element.appendChild(annotation_element);
1326	last_wrote = highlight_end;
1327	}
1328	}
1329
1330	// Finish off any unwritten text
1331	if (last_wrote < content_characters.length)
1332	{
1333	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1334	content_element.appendChild(doc.createTextNode(remaining_text));
1335	}
1336	return content_element;
1337	}
1338
1339	static private class WordMatch
1340	{
1341	public String word;
1342	public int start_position;
1343	public int end_position;
1344	public boolean preceding_word_matched;
1345
1346	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1347	{
1348	this.word = word;
1349	this.start_position = start_position;
1350	this.end_position = end_position;
1351	this.preceding_word_matched = preceding_word_matched;
1352	}
1353	}
1354
1355	static private class PartialPhraseMatch
1356	{
1357	public int start_position;
1358	public int query_phrase_number;
1359	public int num_words_matched;
1360
1361	public PartialPhraseMatch(int start_position, int query_phrase_number)
1362	{
1363	this.start_position = start_position;
1364	this.query_phrase_number = query_phrase_number;
1365	this.num_words_matched = 1;
1366	}
1367	}
1368	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: