Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32071

Last change on this file since 32071 was 32071, checked in by ak19, 6 years ago
Changes to set the docType of the <document> element and to set the docType, nodeType and nodeID on each <documentNode>.
Property svn:keywords set to `Author Date Id Revision`
File size: 52.4 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24	import org.greenstone.util.GlobalProperties;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Text;
31	import org.w3c.dom.NodeList;
32
33	// General Java classes
34	import java.util.ArrayList;
35	import java.util.HashMap;
36	import java.util.HashSet;
37	import java.io.File;
38	import java.io.Serializable;
39
40	import org.apache.log4j.*;
41
42	/** Action class for retrieving Documents via the message router */
43	public class DocumentAction extends Action
44	{
45
// Logger for this action, named after the fully qualified class name.
46	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
// Names of the request parameters that process() reads.
48	// this is used to specify that the sibling nodes of a selected one should be obtained
49	public static final String SIBLING_ARG = "sib";
// when non-empty, process() appends "." + value + ".ss" to the requested document id
50	public static final String GOTO_PAGE_ARG = "gp";
// "1" (with annotations enabled) makes process() send the retrieved node to the service named by GSParams.SERVICE
51	public static final String ENRICH_DOC_ARG = "end";
// "1" makes process() request the entire structure and fetch content for every document node
52	public static final String EXPAND_DOCUMENT_ARG = "ed";
// "1" makes process() request the entire structure (always implied by EXPAND_DOCUMENT_ARG)
53	public static final String EXPAND_CONTENTS_ARG = "ec";
// registered in addActionParameters(); not read anywhere else in the visible part of this class
54	public static final String REALISTIC_BOOK_ARG = "book";
// "1" makes process() skip the content request, unless the document is being expanded
55	public static final String NO_TEXT_ARG = "noText";
// "1" makes process() return the archive (doc.xml) section for editing instead of the indexed document
56	public static final String DOC_EDIT_ARG = "docEdit";
57
58	/**
59	* if this is set to true, when a document is displayed, any annotation type
60	* services (enrich) will be offered to the user as well
61	*/
62	protected boolean provide_annotations = false;
63
// Set to true by configure() when the "highlightQueryTerms" config parameter is "true";
// process() then passes retrieved node content through highlightQueryTerms().
64	protected boolean highlight_query_terms = false;
65
66	public boolean configure()
67	{
68	super.configure();
69	String highlight = (String) config_params.get("highlightQueryTerms");
70	if (highlight != null && highlight.equals("true"))
71	{
72	highlight_query_terms = true;
73	}
74	String annotate = (String) config_params.get("displayAnnotationService");
75	if (annotate != null && annotate.equals("true"))
76	{
77	provide_annotations = true;
78	}
79	return true;
80	}
81
82	public Node process(Node message_node)
83	{
84	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86	Element message = GSXML.nodeToElement(message_node);
87	Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89	// the response
90	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92	result.appendChild(page_response);
93
94	// get the request - assume only one
95	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99	// just in case there are some that need to get passed to the services
100	HashMap service_params = (HashMap) params.get("s0");
101
102	String collection = (String) params.get(GSParams.COLLECTION);
103	String document_id = (String) params.get(GSParams.DOCUMENT);
104	if (document_id != null && document_id.equals(""))
105	{
106	document_id = null;
107	}
108	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109	if (href != null && href.equals(""))
110	{
111	href = null;
112	}
113	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114	if (document_id == null && href == null)
115	{
116	logger.error("no document specified!");
117	return result;
118	}
119	if (rl != null && rl.equals("0"))
120	{
121	// this is a true external link, we should have been directed to a different page or action
122	logger.error("rl value was 0, shouldn't get here");
123	return result;
124	}
125
126	UserContext userContext = new UserContext(request);
127
128	//append site metadata
129	addSiteMetadata(page_response, userContext);
130	addInterfaceOptions(page_response);
131
132	// get the additional data needed for the page
133	getBackgroundData(page_response, collection, userContext);
134	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136	if (format_elem != null) {
137	// lets look for param defaults set in config file
138	NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139	for (int i=0; i<param_defaults.getLength(); i++) {
140	Element p = (Element)param_defaults.item(i);
141	String name = p.getAttribute(GSXML.NAME_ATT);
142	if (params.get(name) ==null) {
143	// wasn't set from interface
144	String value = p.getAttribute(GSXML.VALUE_ATT);
145	params.put(name, value );
146	// also add into request param xml so that xslt knows it too
147	GSXML.addParameterToList(cgi_paramList, name, value);
148	}
149	}
150	}
151
152	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153	if (document_type != null && document_type.equals(""))
154	{
155	//document_type = "hierarchy";
156	document_type = null; // we'll get it later if not already specified
157	}
158	// what if it is null here?? Anu to check...
159
160
161	boolean editing_document = false;
162	String doc_edit = (String) params.get(DOC_EDIT_ARG);
163	if (doc_edit != null && doc_edit.equals("1")) {
164	editing_document = true;
165	}
166
167	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
168	if (editing_document) {
169
170	// call get archive doc
171	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172	String to = "DocXMLGetSection";
173	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174	dx_message.appendChild(dx_request);
175	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178	dx_request.appendChild(dx_section);
179
180	Element dx_response_message = (Element) this.mr.process(dx_message);
181	if (processErrorElements(dx_response_message, page_response))
182	{
183	return result;
184	}
185
186	// get the section out
187	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189	if (section == null) {
190	logger.error("no archive doc returned for "+document_id);
191	return result;
192	}
193	// convert the archive format into the internal format that the page response requires
194
195	// work out doctype
196	// create a basic doc list containing the current node
197	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
198	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
199	basic_doc_list.appendChild(current_doc);
200	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
201	basic_doc_list.appendChild(current_doc);
202	if (document_type == null) {
203	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
204	}
205	if (document_type == null) {
206	logger.debug("@@@ doctype is null, setting to simple");
207	document_type = GSXML.DOC_TYPE_SIMPLE;
208	}
209
210	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
211	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
212	page_response.appendChild(doc_elem);
213	section.setAttribute(GSXML.NODE_ID_ATT, document_id);
214
215
216	Element transformed_section = transformArchiveToDocument(section);
217	// In docEdit mode, we obtain the text from archives, from doc.xml
218	// Now the transformation has replaced <Section> with <documentNode>
219	// Need to add nodeID, nodeType and docType attributes to each docNode
220	// as doc.xml doesn't store that.
221	insertDocNodeAttributes(transformed_section, document_type, null);
222	doc_elem.appendChild(doc.importNode(transformed_section, true));
223	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
224
225	return result;
226	}
227
228	//whether to retrieve siblings or not
229	boolean get_siblings = false;
230	String sibs = (String) params.get(SIBLING_ARG);
231	if (sibs != null && sibs.equals("1"))
232	{
233	get_siblings = true;
234	}
235
236	String doc_id_modifier = "";
237	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
238	if (sibling_num != null && !sibling_num.equals(""))
239	{
240	// we have to modify the doc name
241	doc_id_modifier = "." + sibling_num + ".ss";
242	}
243
244	boolean expand_document = false;
245	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
246	if (ed_arg != null && ed_arg.equals("1"))
247	{
248	expand_document = true;
249	}
250
251	boolean expand_contents = false;
252	if (expand_document)
253	{ // we always expand the contents with the text
254	expand_contents = true;
255	}
256	else
257	{
258	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
259	if (ec_arg != null && ec_arg.equals("1"))
260	{
261	expand_contents = true;
262	}
263	}
264
265	// do we want text content? Not if no_text=1.
266	// expand_document overrides this. - should it??
267	boolean get_text = true;
268	String nt_arg = (String) params.get(NO_TEXT_ARG);
269
270	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
271	logger.debug("SETTING GET TEXT TO FALSE");
272	get_text = false;
273	} else {
274	logger.debug("GET TEXT REMAINS TRUE");
275	}
276
277	// the_document is where all the doc info - structure and metadata etc
278	// is added into, to be returned in the page
279	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
280	page_response.appendChild(the_document);
281
282	// create a basic doc list containing the current node
283	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
284	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
285	basic_doc_list.appendChild(current_doc);
286	if (document_id != null)
287	{
288	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
289	}
290	else
291	{
292	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
293	// do we need this??
294	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
295	}
296
297	if (document_type == null)
298	{
299	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
300	}
301	if (document_type == null)
302	{
303	logger.debug("##### doctype is null, setting to simple");
304	document_type = GSXML.DOC_TYPE_SIMPLE;
305	}
306
307	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
308
309	// Create a parameter list to specify the required structure information
310	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
311
312	if (service_params != null)
313	{
314	GSXML.addParametersToList(ds_param_list, service_params);
315	}
316
317	Element ds_param = null;
318	boolean get_structure = false;
319	boolean get_structure_info = false;
320	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
321	{
322	get_structure_info = true;
323
324	if (expand_contents)
325	{
326	ds_param = doc.createElement(GSXML.PARAM_ELEM);
327	ds_param_list.appendChild(ds_param);
328	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
329	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
330	}
331
332	// get the info needed for paged naviagtion
333	ds_param = doc.createElement(GSXML.PARAM_ELEM);
334	ds_param_list.appendChild(ds_param);
335	ds_param.setAttribute(GSXML.NAME_ATT, "info");
336	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
337	ds_param = doc.createElement(GSXML.PARAM_ELEM);
338	ds_param_list.appendChild(ds_param);
339	ds_param.setAttribute(GSXML.NAME_ATT, "info");
340	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
341	ds_param = doc.createElement(GSXML.PARAM_ELEM);
342	ds_param_list.appendChild(ds_param);
343	ds_param.setAttribute(GSXML.NAME_ATT, "info");
344	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
345
346	if (get_siblings)
347	{
348	ds_param = doc.createElement(GSXML.PARAM_ELEM);
349	ds_param_list.appendChild(ds_param);
350	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
351	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
352	}
353
354	}
355	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
356	{
357	get_structure = true;
358	if (expand_contents)
359	{
360	ds_param = doc.createElement(GSXML.PARAM_ELEM);
361	ds_param_list.appendChild(ds_param);
362	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
363	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
364	}
365	else
366	{
367	// get the info needed for table of contents
368	ds_param = doc.createElement(GSXML.PARAM_ELEM);
369	ds_param_list.appendChild(ds_param);
370	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
371	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
372	ds_param = doc.createElement(GSXML.PARAM_ELEM);
373	ds_param_list.appendChild(ds_param);
374	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
375	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
376	if (get_siblings)
377	{
378	ds_param = doc.createElement(GSXML.PARAM_ELEM);
379	ds_param_list.appendChild(ds_param);
380	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
381	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
382	}
383	}
384	}
385	else
386	{
387	// we dont need any structure
388	}
389
390	boolean has_dummy = false;
391	if (get_structure \|\| get_structure_info)
392	{
393
394	// Build a request to obtain the document structure
395	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
396	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
397	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
398	ds_message.appendChild(ds_request);
399	ds_request.appendChild(ds_param_list);
400
401	// add the node list we created earlier
402	ds_request.appendChild(basic_doc_list);
403
404	// Process the document structure retrieve message
405	Element ds_response_message = (Element) this.mr.process(ds_message);
406	if (processErrorElements(ds_response_message, page_response))
407	{
408	return result;
409	}
410
411	// get the info and print out
412	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
413	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
414	path = GSPath.appendLink(path, "nodeStructureInfo");
415	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
416	// get the doc_node bit
417	if (ds_response_struct_info != null)
418	{
419	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
420	}
421	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
422	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
423	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
424	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
425
426	if (ds_response_structure != null)
427	{
428	// add the contents of the structure bit into the_document
429	NodeList structs = ds_response_structure.getChildNodes();
430	for (int i = 0; i < structs.getLength(); i++)
431	{
432	the_document.appendChild(doc.importNode(structs.item(i), true));
433	}
434	}
435	else
436	{
437	// no structure nodes, so put in a dummy doc node
438	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
439	if (document_id != null)
440	{
441	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
442	}
443	else
444	{
445	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
446
447	}
448	the_document.appendChild(doc_node);
449	has_dummy = true;
450	}
451	}
452	else
453	{ // a simple type - we dont have a dummy node for simple
454	// should think about this more
455	// no structure request, so just put in a dummy doc node
456	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
457	if (document_id != null)
458	{
459	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
460	}
461	else
462	{
463	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
464	}
465	the_document.appendChild(doc_node);
466	has_dummy = true;
467	}
468
469	// Build a request to obtain some document metadata
470	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
471	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
472	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
473	dm_message.appendChild(dm_request);
474	// Create a parameter list to specify the required metadata information
475
476	HashSet<String> meta_names = new HashSet<String>();
477	meta_names.add("Title"); // the default
478	if (format_elem != null)
479	{
480	getRequiredMetadataNames(format_elem, meta_names);
481	}
482
483	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
484	if (extraMetaListElem != null)
485	{
486	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
487	for (int i = 0; i < extraMetaList.getLength(); i++)
488	{
489	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
490	}
491	}
492
493	Element dm_param_list = createMetadataParamList(doc,meta_names);
494	if (service_params != null)
495	{
496	GSXML.addParametersToList(dm_param_list, service_params);
497	}
498
499	dm_request.appendChild(dm_param_list);
500
501	// create the doc node list for the metadata request
502	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
503	dm_request.appendChild(dm_doc_list);
504
505	// Add each node from the structure response into the metadata request
506	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
507	for (int i = 0; i < doc_nodes.getLength(); i++)
508	{
509	Element doc_node = (Element) doc_nodes.item(i);
510	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
511
512	// Add the documentNode to the list
513	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
514	dm_doc_list.appendChild(dm_doc_node);
515	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
516	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
517	if (document_id == null){
518	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
519	}
520
521	}
522
523	// we also want a metadata request to the top level document to get
524	// assocfilepath - this could be cached too
525	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
526	dm_message.appendChild(doc_meta_request);
527	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
528	if (service_params != null)
529	{
530	GSXML.addParametersToList(doc_meta_param_list, service_params);
531	}
532
533	doc_meta_request.appendChild(doc_meta_param_list);
534	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
535	doc_meta_param_list.appendChild(doc_param);
536	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
537	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
538
539	// create the doc node list for the metadata request
540	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
541	doc_meta_request.appendChild(doc_list);
542
543	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
544	// the node we want is the root document node
545	if (document_id != null)
546	{
547	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
548	}
549	/*else
550	{
551	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
552	// can we assume that href is always a top level doc??
553	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
554	//doc_node.setAttribute("externalURL", has_rl);
555	}*/
556	doc_list.appendChild(doc_node);
557
558	Element dm_response_message = (Element) this.mr.process(dm_message);
559	if (processErrorElements(dm_response_message, page_response))
560	{
561	return result;
562	}
563
564	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
565	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
566
567	// Merge the metadata with the structure information
568	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
569	for (int i = 0; i < doc_nodes.getLength(); i++)
570	{
571	GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
572	}
573	// get the top level doc metadata out
574	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
575	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
576	GSXML.mergeMetadataLists(the_document, top_doc_node);
577
578	// do we want doc text content? If not, we are done.
579	if (!get_text) {
580	// don't get text
581	return result;
582	}
583
584	// Build a request to obtain some document content
585	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
586	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
587	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
588	dc_message.appendChild(dc_request);
589
590	// Create a parameter list to specify the request parameters - empty for now
591	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
592	if (service_params != null)
593	{
594	GSXML.addParametersToList(dc_param_list, service_params);
595	}
596
597	dc_request.appendChild(dc_param_list);
598
599	// get the content
600	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
601	if (expand_document)
602	{
603	dc_request.appendChild(dm_doc_list);
604	}
605	else
606	{
607	dc_request.appendChild(basic_doc_list);
608	}
609	Element dc_response_message = (Element) this.mr.process(dc_message);
610	if (processErrorElements(dc_response_message, page_response))
611	{
612	return result;
613	}
614	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
615
616	if (expand_document)
617	{
618	// Merge the content with the structure information
619	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
620	for (int i = 0; i < doc_nodes.getLength(); i++)
621	{
622	Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
623	if (content != null)
624	{
625	if (highlight_query_terms)
626	{
627	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
628	content = highlightQueryTerms(request, node_id, (Element) content);
629	}
630
631	doc_nodes.item(i).appendChild(doc.importNode(content, true));
632	}
633	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
634	}
635	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
636	Element dummy_node = (Element) doc_nodes.item(0);
637	the_document.removeChild(dummy_node);
638	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
639	NodeList dummy_children = dummy_node.getChildNodes();
640	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
641	{
642	// special case as we don't want more than one metadata list
643	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
644	{
645	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
646	}
647	else
648	{
649	the_document.appendChild(dummy_children.item(i));
650	}
651	}
652	}
653	}
654	else
655	{
656	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
657	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
658	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
659	//Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
660
661	if (dc_response_doc_content == null)
662	{
663	// no content to add
664	if (dc_response_doc.getAttribute("external").equals("true"))
665	{
666
667	//if (dc_response_doc_external != null)
668	//{
669	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
670
671	the_document.setAttribute("selectedNode", href_id);
672	the_document.setAttribute("external", href_id);
673	}
674	return result;
675	}
676	if (highlight_query_terms)
677	{
678	dc_response_doc.removeChild(dc_response_doc_content);
679
680	dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
681	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
682	}
683
684	if (provide_annotations)
685	{
686	String service_selected = (String) params.get(ENRICH_DOC_ARG);
687	if (service_selected != null && service_selected.equals("1"))
688	{
689	// now we can modifiy the response doc if needed
690	String enrich_service = (String) params.get(GSParams.SERVICE);
691	// send a message to the service
692	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
693	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
694	enrich_message.appendChild(enrich_request);
695	// check for parameters
696	HashMap e_service_params = (HashMap) params.get("s1");
697	if (e_service_params != null)
698	{
699	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
700	GSXML.addParametersToList(enrich_pl, e_service_params);
701	enrich_request.appendChild(enrich_pl);
702	}
703	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
704	enrich_request.appendChild(e_doc_list);
705	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
706
707	Node enrich_response = this.mr.process(enrich_message);
708
709	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
710	path = GSPath.createPath(links);
711	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
712
713	}
714	} // if provide_annotations
715
716	// use the returned id rather than the sent one cos there may have
717	// been modifiers such as .pr that are removed.
718	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
719	the_document.setAttribute("selectedNode", modified_doc_id);
720	if (has_dummy)
721	{
722	// change the id if necessary and add the content
723	Element dummy_node = (Element) doc_nodes.item(0);
724
725	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
726	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
727	// hack for simple type
728	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
729	{
730	// we dont want the internal docNode, just want the content and metadata in the document
731	// rethink this!!
732	the_document.removeChild(dummy_node);
733
734	NodeList dummy_children = dummy_node.getChildNodes();
735	//for (int i=0; i<dummy_children.getLength(); i++) {
736	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
737	{
738	// special case as we don't want more than one metadata list
739	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
740	{
741	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
742	}
743	else
744	{
745	the_document.appendChild(dummy_children.item(i));
746	}
747	}
748	}
749
750	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
751	}
752	else
753	{
754	// Merge the document content with the metadata and structure information
755	for (int i = 0; i < doc_nodes.getLength(); i++)
756	{
757	Node dn = doc_nodes.item(i);
758	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
759	if (dn_id.equals(modified_doc_id))
760	{
761	dn.appendChild(doc.importNode(dc_response_doc_content, true));
762	break;
763	}
764	}
765	}
766	}
767	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
768	return result;
769	}
770
771	/**
772	* tell the param class what its arguments are if an action has its own
773	* arguments, this should add them to the params object - particularly
774	* important for args that should not be saved
775	*/
776	public boolean addActionParameters(GSParams params)
777	{
778	params.addParameter(GOTO_PAGE_ARG, false);
779	params.addParameter(ENRICH_DOC_ARG, false);
780	params.addParameter(EXPAND_DOCUMENT_ARG, false);
781	params.addParameter(EXPAND_CONTENTS_ARG, false);
782	params.addParameter(REALISTIC_BOOK_ARG, false);
783
784	return true;
785	}
786
787	/**
788	* this method gets the collection description, the format info, the list of
789	* enrich services, etc - stuff that is needed for the page, but is the same
790	* whatever the query is - should be cached
791	*/
792	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
793	{
794	Document doc = page_response.getOwnerDocument();
795
796	// create a message to process - contains requests for the collection
797	// description, the format element, the enrich services on offer
798	// these could all be cached
799	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
800	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
801	// the format request - ignore for now, where does this request go to??
802	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
803	info_message.appendChild(format_request);
804
805	// the enrich_services request - only do this if provide_annotations is true
806
807	if (provide_annotations)
808	{
809	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
810	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
811	info_message.appendChild(enrich_services_request);
812	}
813
814	Element info_response = (Element) this.mr.process(info_message);
815
816	// the collection is the first response
817	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
818	Element format_resp = (Element) responses.item(0);
819
820	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
821	if (format_elem != null)
822	{
823	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
824	if (global_format_elem != null)
825	{
826	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
827	}
828
829	// set the format type
830	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
831	page_response.appendChild(doc.importNode(format_elem, true));
832	}
833
834	if (provide_annotations)
835	{
836	Element services_resp = (Element) responses.item(1);
837
838	// a new message for the mr
839	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
840	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
841	boolean service_found = false;
842	for (int j = 0; j < e_services.getLength(); j++)
843	{
844	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
845	{
846	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
847	enrich_message.appendChild(s);
848	service_found = true;
849	}
850	}
851	if (service_found)
852	{
853	Element enrich_response = (Element) this.mr.process(enrich_message);
854
855	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
856	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
857	for (int i = 0; i < e_responses.getLength(); i++)
858	{
859	Element e_resp = (Element) e_responses.item(i);
860	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
861	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
862	service_list.appendChild(e_service);
863	}
864	page_response.appendChild(service_list);
865	}
866	} // if provide_annotations
867	return true;
868
869	}
870
871	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
872	{
873	Document doc = basic_doc_list.getOwnerDocument();
874
875	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
876	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
877	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
878	ds_message.appendChild(ds_request);
879
880	// Create a parameter list to specify the required structure information
881	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
882	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
883	ds_param_list.appendChild(ds_param);
884	ds_param.setAttribute(GSXML.NAME_ATT, "info");
885	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
886
887	ds_request.appendChild(ds_param_list);
888
889	// add the node list we created earlier
890	ds_request.appendChild(basic_doc_list);
891
892	// Process the document structure retrieve message
893	Element ds_response_message = (Element) this.mr.process(ds_message);
894	if (processErrorElements(ds_response_message, page_response))
895	{
896	return null;
897	}
898
899	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
900	String path = GSPath.createPath(links);
901	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
902	if (info_elem == null) {
903	return null;
904	}
905	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
906	if (doctype_elem != null)
907	{
908	String doc_type = doctype_elem.getAttribute("value");
909	return doc_type;
910	}
911	return null;
912	}
913
914	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
915	// The docType remains constant as in parameter document_type
916	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
917	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
918	// The nodeType is root if rootNode, internal if there are children and leaf if no children
919	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
920
921	boolean isRoot = false;
922	if(id == null) { // rootNode, get the root nodeID to work with recursively
923	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
924	isRoot = true;
925	} else { // for all but the root node, need to still set the nodeID
926	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
927	}
928
929	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
930
931	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
932	if(docNodes.getLength() > 0) {
933	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
934	for(int i = 0; i < docNodes.getLength(); i++) {
935	Element childDocNode = (Element)docNodes.item(i);
936
937	// work out the child docNode's nodeID based on current id
938	String nodeID = id + "." + (i+1);
939	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
940	}
941	} else {
942	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
943	}
944
945	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
946	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
947
948	}
949
950	/** run the XSLT transform which converts from doc.xml format to our internal document format */
951	protected Element transformArchiveToDocument(Element section) {
952
953	String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
954	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
955	if (stylesheet_doc == null) {
956	logger.error("Couldn't load in stylesheet "+stylesheet_file);
957	return section;
958	}
959
960	Document section_doc = XMLConverter.newDOM();
961	section_doc.appendChild(section_doc.importNode(section, true));
962	Node result = this.transformer.transform(stylesheet_doc, section_doc);
963	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
964
965	Element new_element;
966	if (result.getNodeType() == Node.DOCUMENT_NODE) {
967	new_element = ((Document) result).getDocumentElement();
968	} else {
969	new_element = (Element) result;
970	}
971
972
973	return new_element;
974
975	}
976
977
978	/**
979	* this involves a bit of a hack to get the equivalent query terms - has to
980	* requery the query service - uses the last selected service name. (if it
981	* ends in query). should this action do the query or should it send a
982	* message to the query action? but that will involve lots of extra stuff.
983	* also doesn't handle phrases properly - just highlights all the terms
984	* found in the text.
985	*/
986	protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
987	{
988	Document doc = request.getOwnerDocument();
989
990	// do the query again to get term info
991	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
992	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
993
994	HashMap previous_params = (HashMap) params.get("p");
995	if (previous_params == null)
996	{
997	return dc_response_doc_content;
998	}
999	String service_name = (String) previous_params.get(GSParams.SERVICE);
1000	if (service_name == null \|\| !service_name.endsWith("Query"))
1001	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1002	logger.debug("invalid service, not doing highlighting");
1003	return dc_response_doc_content;
1004	}
1005	String collection = (String) params.get(GSParams.COLLECTION);
1006	UserContext userContext = new UserContext(request);
1007	String to = GSPath.appendLink(collection, service_name);
1008
1009	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1010	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1011	mr_query_message.appendChild(mr_query_request);
1012
1013	// paramList
1014	HashMap service_params = (HashMap) params.get("s1");
1015
1016	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1017	GSXML.addParametersToList(query_param_list, service_params);
1018	if (current_node_id != null) {
1019	GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1020	} else {
1021	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1022	}
1023	mr_query_request.appendChild(query_param_list);
1024	// do the query
1025	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1026	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1027	Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1028	// For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1029	if (highlighted_Node != null)
1030	{
1031	// Build a request to process highlighted text
1032
1033	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1034	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1035	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1036	hl_message.appendChild(dc_request);
1037
1038	// Create a parameter list to specify the request parameters - empty for now
1039	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1040	dc_request.appendChild(dc_param_list);
1041
1042	// get the content
1043	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1044	dc_request.appendChild(doc_list);
1045	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1046	doc_list.appendChild(current_doc);
1047	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1048	//Append highlighted content to request for processing
1049	dc_request.appendChild(doc.importNode(highlighted_Node, true));
1050	Element hl_response_message = (Element) this.mr.process(hl_message);
1051
1052	//Get results
1053	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1054	Element content = (Element) contentList.item(0);
1055	return content;
1056	}
1057	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1058	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1059	if (query_term_list_element == null)
1060	{
1061	// no term info
1062	logger.error("No query term information.\n");
1063	return dc_response_doc_content;
1064	}
1065
1066	String content = GSXML.getNodeText(dc_response_doc_content);
1067
1068	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1069	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1070
1071	HashSet<String> query_term_variants = new HashSet<String>();
1072	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1073	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1074	{
1075	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1076	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1077	{
1078	for (int i = 0; i < terms_nodelist.getLength(); i++)
1079	{
1080	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1081	String termValueU = null;
1082	String termValueL = null;
1083
1084	if (termValue.length() > 1)
1085	{
1086	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1087	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1088	}
1089	else
1090	{
1091	termValueU = termValue.substring(0, 1).toUpperCase();
1092	termValueL = termValue.substring(0, 1).toLowerCase();
1093	}
1094
1095	query_term_variants.add(termValueU);
1096	query_term_variants.add(termValueL);
1097	}
1098	}
1099	}
1100	else
1101	{
1102	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1103	{
1104	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1105	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1106	for (int j = 0; j < equivalent_terms.length; j++)
1107	{
1108	query_term_variants.add(equivalent_terms[j]);
1109	}
1110	}
1111	}
1112
1113	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1114
1115	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1116	String performed_query = GSXML.getNodeText(query_element) + " ";
1117
1118	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1119	int term_start = 0;
1120	boolean in_term = false;
1121	boolean in_phrase = false;
1122	for (int i = 0; i < performed_query.length(); i++)
1123	{
1124	char character = performed_query.charAt(i);
1125	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1126
1127	// Has a query term just started?
1128	if (in_term == false && is_character_letter_or_digit == true)
1129	{
1130	in_term = true;
1131	term_start = i;
1132	}
1133
1134	// Or has a term just finished?
1135	else if (in_term == true && is_character_letter_or_digit == false)
1136	{
1137	in_term = false;
1138	String term = performed_query.substring(term_start, i);
1139
1140	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1141	if (term_element != null)
1142	{
1143
1144	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1145
1146	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1147	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1148	{
1149	String termValueU = null;
1150	String termValueL = null;
1151
1152	if (term.length() > 1)
1153	{
1154	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1155	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1156	}
1157	else
1158	{
1159	termValueU = term.substring(0, 1).toUpperCase();
1160	termValueL = term.substring(0, 1).toLowerCase();
1161	}
1162
1163	phrase_query_p_term_x_variants.add(termValueU);
1164	phrase_query_p_term_x_variants.add(termValueL);
1165	}
1166	else
1167	{
1168	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1169	{
1170	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1171	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1172	for (int k = 0; k < term_equivalent_terms.length; k++)
1173	{
1174	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1175	}
1176	}
1177	}
1178	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1179
1180	if (in_phrase == false)
1181	{
1182	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1183	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1184	}
1185	}
1186	}
1187	// Watch for phrases (surrounded by quotes)
1188	if (character == '\"')
1189	{
1190	// Has a phrase just started?
1191	if (in_phrase == false)
1192	{
1193	in_phrase = true;
1194	}
1195	// Or has a phrase just finished?
1196	else if (in_phrase == true)
1197	{
1198	in_phrase = false;
1199	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1200	}
1201
1202	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1203	}
1204	}
1205
1206	return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1207	}
1208
1209	/**
1210	* Highlights query terms in a piece of text.
1211	*/
1212	private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1213	{
1214	// Convert the content string to an array of characters for speed
1215	char[] content_characters = new char[content.length()];
1216	content.getChars(0, content.length(), content_characters, 0);
1217
1218	// Now skim through the content, identifying word matches
1219	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1220	int word_start = 0;
1221	boolean in_word = false;
1222	boolean preceding_word_matched = false;
1223	boolean inTag = false;
1224	for (int i = 0; i < content_characters.length; i++)
1225	{
1226	//We don't want to find words inside HTML tags
1227	if (content_characters[i] == '<')
1228	{
1229	inTag = true;
1230	continue;
1231	}
1232	else if (inTag && content_characters[i] == '>')
1233	{
1234	inTag = false;
1235	}
1236	else if (inTag)
1237	{
1238	continue;
1239	}
1240
1241	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1242
1243	// Has a word just started?
1244	if (in_word == false && is_character_letter_or_digit == true)
1245	{
1246	in_word = true;
1247	word_start = i;
1248	}
1249
1250	// Or has a word just finished?
1251	else if (in_word == true && is_character_letter_or_digit == false)
1252	{
1253	in_word = false;
1254
1255	// Check if the word matches any of the query term equivalents
1256	String word = new String(content_characters, word_start, (i - word_start));
1257	if (query_term_variants.contains(word))
1258	{
1259	// We have found a matching word, so remember its location
1260	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1261	preceding_word_matched = true;
1262	}
1263	else
1264	{
1265	preceding_word_matched = false;
1266	}
1267	}
1268	}
1269
1270	// Don't forget the last word...
1271	if (in_word == true)
1272	{
1273	// Check if the word matches any of the query term equivalents
1274	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1275	if (query_term_variants.contains(word))
1276	{
1277	// We have found a matching word, so remember its location
1278	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1279	}
1280	}
1281
1282	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1283	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1284
1285	// Deal with phrases now
1286	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1287	for (int i = 0; i < word_matches.size(); i++)
1288	{
1289	WordMatch word_match = word_matches.get(i);
1290
1291	// See if any partial phrase matches are extended by this word
1292	if (word_match.preceding_word_matched)
1293	{
1294	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1295	{
1296	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1297	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1298	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1299	if (phrase_query_p_term_x_variants.contains(word_match.word))
1300	{
1301	partial_phrase_match.num_words_matched++;
1302
1303	// Has a complete phrase match occurred?
1304	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1305	{
1306	// Check for overlaps by looking at the previous highlight range
1307	if (!highlight_end_positions.isEmpty())
1308	{
1309	int last_highlight_index = highlight_end_positions.size() - 1;
1310	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1311	if (last_highlight_end > partial_phrase_match.start_position)
1312	{
1313	// There is an overlap, so remove the previous phrase match
1314	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1315	highlight_end_positions.remove(last_highlight_index);
1316	partial_phrase_match.start_position = last_highlight_start;
1317	}
1318	}
1319
1320	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1321	highlight_end_positions.add(new Integer(word_match.end_position));
1322	}
1323	// No, but add the partial match back into the list for next time
1324	else
1325	{
1326	partial_phrase_matches.add(partial_phrase_match);
1327	}
1328	}
1329	}
1330	}
1331	else
1332	{
1333	partial_phrase_matches.clear();
1334	}
1335
1336	// See if this word is at the start of any of the phrases
1337	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1338	{
1339	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1340	if (phrase_query_p_term_variants_list.size()>0) {
1341	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1342	if (phrase_query_p_term_1_variants.contains(word_match.word))
1343	{
1344	// If this phrase is just one word long, we have a complete match
1345	if (phrase_query_p_term_variants_list.size() == 1)
1346	{
1347	highlight_start_positions.add(new Integer(word_match.start_position));
1348	highlight_end_positions.add(new Integer(word_match.end_position));
1349	}
1350	// Otherwise we have the start of a potential phrase match
1351	else
1352	{
1353	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1354	}
1355	}
1356	}
1357	}
1358	}
1359
1360	// Now add the annotation tags into the document at the correct points
1361	Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1362
1363	int last_wrote = 0;
1364	for (int i = 0; i < highlight_start_positions.size(); i++)
1365	{
1366	int highlight_start = highlight_start_positions.get(i).intValue();
1367	int highlight_end = highlight_end_positions.get(i).intValue();
1368
1369	// Print anything before the highlight range
1370	if (last_wrote < highlight_start)
1371	{
1372	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1373	content_element.appendChild(doc.createTextNode(preceding_text));
1374	}
1375
1376	// Print the highlight text, annotated
1377	if (highlight_end > last_wrote)
1378	{
1379	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1380	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1381	annotation_element.setAttribute("type", "query_term");
1382	content_element.appendChild(annotation_element);
1383	last_wrote = highlight_end;
1384	}
1385	}
1386
1387	// Finish off any unwritten text
1388	if (last_wrote < content_characters.length)
1389	{
1390	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1391	content_element.appendChild(doc.createTextNode(remaining_text));
1392	}
1393	return content_element;
1394	}
1395
1396	static private class WordMatch
1397	{
1398	public String word;
1399	public int start_position;
1400	public int end_position;
1401	public boolean preceding_word_matched;
1402
1403	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1404	{
1405	this.word = word;
1406	this.start_position = start_position;
1407	this.end_position = end_position;
1408	this.preceding_word_matched = preceding_word_matched;
1409	}
1410	}
1411
1412	static private class PartialPhraseMatch
1413	{
1414	public int start_position;
1415	public int query_phrase_number;
1416	public int num_words_matched;
1417
1418	public PartialPhraseMatch(int start_position, int query_phrase_number)
1419	{
1420	this.start_position = start_position;
1421	this.query_phrase_number = query_phrase_number;
1422	this.num_words_matched = 1;
1423	}
1424	}
1425	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: