Context Navigation

source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32111

Last change on this file since 32111 was 32111, checked in by kjdon, 6 years ago
pass in base interfaces array to the call to find archive2document.xsl. If you have a custom interface it will probably live in the default one. Then check to make sure the file was there before trying to use it.
Property svn:keywords set to `Author Date Id Revision`
File size: 53.3 KB

Line
1	/*
2	* DocumentAction.java
3	* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4	*
5	* This program is free software; you can redistribute it and/or modify
6	* it under the terms of the GNU General Public License as published by
7	* the Free Software Foundation; either version 2 of the License, or
8	* (at your option) any later version.
9	*
10	* This program is distributed in the hope that it will be useful,
11	* but WITHOUT ANY WARRANTY; without even the implied warranty of
12	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13	* GNU General Public License for more details.
14	*
15	* You should have received a copy of the GNU General Public License
16	* along with this program; if not, write to the Free Software
17	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18	*/
19	package org.greenstone.gsdl3.action;
20
21	// Greenstone classes
22	import org.greenstone.gsdl3.core.ModuleInterface;
23	import org.greenstone.gsdl3.util.*;
24	import org.greenstone.util.GlobalProperties;
25
26	// XML classes
27	import org.w3c.dom.Document;
28	import org.w3c.dom.Element;
29	import org.w3c.dom.Node;
30	import org.w3c.dom.Text;
31	import org.w3c.dom.NodeList;
32
33	// General Java classes
34	import java.util.ArrayList;
35	import java.util.HashMap;
36	import java.util.HashSet;
37	import java.io.File;
38	import java.io.Serializable;
39
40	import org.apache.log4j.*;
41
42	/** Action class for retrieving Documents via the message router */
43	public class DocumentAction extends Action
44	{
45
46	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48	// this is used to specify that the sibling nodes of a selected one should be obtained
49	public static final String SIBLING_ARG = "sib";
50	public static final String GOTO_PAGE_ARG = "gp";
51	public static final String ENRICH_DOC_ARG = "end";
52	public static final String EXPAND_DOCUMENT_ARG = "ed";
53	public static final String EXPAND_CONTENTS_ARG = "ec";
54	public static final String REALISTIC_BOOK_ARG = "book";
55	public static final String NO_TEXT_ARG = "noText";
56	public static final String DOC_EDIT_ARG = "docEdit";
57
58	/**
59	* if this is set to true, when a document is displayed, any annotation type
60	* services (enrich) will be offered to the user as well
61	*/
62	protected boolean provide_annotations = false;
63
64	protected boolean highlight_query_terms = false;
65
66	public boolean configure()
67	{
68	super.configure();
69	String highlight = (String) config_params.get("highlightQueryTerms");
70	if (highlight != null && highlight.equals("true"))
71	{
72	highlight_query_terms = true;
73	}
74	String annotate = (String) config_params.get("displayAnnotationService");
75	if (annotate != null && annotate.equals("true"))
76	{
77	provide_annotations = true;
78	}
79	return true;
80	}
81
82	public Node process(Node message_node)
83	{
84	// for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86	Element message = GSXML.nodeToElement(message_node);
87	Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89	// the response
90	Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91	Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92	result.appendChild(page_response);
93
94	// get the request - assume only one
95	Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96	Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97	HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99	// just in case there are some that need to get passed to the services
100	HashMap service_params = (HashMap) params.get("s0");
101
102	String collection = (String) params.get(GSParams.COLLECTION);
103	String document_id = (String) params.get(GSParams.DOCUMENT);
104	if (document_id != null && document_id.equals(""))
105	{
106	document_id = null;
107	}
108	String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109	if (href != null && href.equals(""))
110	{
111	href = null;
112	}
113	String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114	if (document_id == null && href == null)
115	{
116	logger.error("no document specified!");
117	return result;
118	}
119	if (rl != null && rl.equals("0"))
120	{
121	// this is a true external link, we should have been directed to a different page or action
122	logger.error("rl value was 0, shouldn't get here");
123	return result;
124	}
125
126	UserContext userContext = new UserContext(request);
127
128	//append site metadata
129	addSiteMetadata(page_response, userContext);
130	addInterfaceOptions(page_response);
131
132	// get the additional data needed for the page
133	getBackgroundData(page_response, collection, userContext);
134	Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136	if (format_elem != null) {
137	// lets look for param defaults set in config file
138	NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139	for (int i=0; i<param_defaults.getLength(); i++) {
140	Element p = (Element)param_defaults.item(i);
141	String name = p.getAttribute(GSXML.NAME_ATT);
142	if (params.get(name) ==null) {
143	// wasn't set from interface
144	String value = p.getAttribute(GSXML.VALUE_ATT);
145	params.put(name, value );
146	// also add into request param xml so that xslt knows it too
147	GSXML.addParameterToList(cgi_paramList, name, value);
148	}
149	}
150	}
151
152	String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153	if (document_type != null && document_type.equals(""))
154	{
155	//document_type = "hierarchy";
156	document_type = null; // we'll get it later if not already specified
157	}
158	// what if it is null here?? Anu to check...
159
160
161	boolean editing_document = false;
162	String doc_edit = (String) params.get(DOC_EDIT_ARG);
163	if (doc_edit != null && doc_edit.equals("1")) {
164	editing_document = true;
165	}
166
167	// are we editing mode? just get the archive document, convert to our internal doc format, and return it
168	if (editing_document) {
169
170	// call get archive doc
171	Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172	String to = "DocXMLGetSection";
173	Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174	dx_message.appendChild(dx_request);
175	Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176	dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177	dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178	dx_request.appendChild(dx_section);
179
180	Element dx_response_message = (Element) this.mr.process(dx_message);
181	if (processErrorElements(dx_response_message, page_response))
182	{
183	return result;
184	}
185
186	// get the section out
187	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188	Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189	if (section == null) {
190	logger.error("no archive doc returned for "+document_id);
191	return result;
192	}
193	// convert the archive format into the internal format that the page response requires
194
195	// work out doctype
196	// NOTE: this will be coming from collection database in index
197	// the archive file doesn't store this. So we have to assume
198	// that the doc type will not be changing with any
199	// modifications happening to archives.
200
201	// if doc type is null, then we need to work it out.
202	// create a basic doc list containing the current node
203
204	if (document_type == null) {
205	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
206	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
207	basic_doc_list.appendChild(current_doc);
208	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
209	basic_doc_list.appendChild(current_doc);
210	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
211	}
212
213	if (document_type == null) {
214	logger.debug("@@@ doctype is null, setting to simple");
215	document_type = GSXML.DOC_TYPE_SIMPLE;
216	}
217
218	Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
219	doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
220	page_response.appendChild(doc_elem);
221
222	Element transformed_section = transformArchiveToDocument(section);
223	if (document_type == GSXML.DOC_TYPE_SIMPLE) {
224	// simple doc, only returning a single document node, which is the top level section.
225	doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
226	GSXML.mergeElements(doc_elem, transformed_section);
227	return result;
228	}
229
230	// multi sectioned document.
231	transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
232	// In docEdit mode, we obtain the text from archives, from doc.xml
233	// Now the transformation has replaced <Section> with <documentNode>
234	// Need to add nodeID, nodeType and docType attributes to each docNode
235	// as doc.xml doesn't store that.
236	insertDocNodeAttributes(transformed_section, document_type, null);
237	doc_elem.appendChild(doc.importNode(transformed_section, true));
238	logger.debug("dx result = "+XMLConverter.getPrettyString(result));
239
240	return result;
241	}
242
243	//whether to retrieve siblings or not
244	boolean get_siblings = false;
245	String sibs = (String) params.get(SIBLING_ARG);
246	if (sibs != null && sibs.equals("1"))
247	{
248	get_siblings = true;
249	}
250
251	String doc_id_modifier = "";
252	String sibling_num = (String) params.get(GOTO_PAGE_ARG);
253	if (sibling_num != null && !sibling_num.equals(""))
254	{
255	// we have to modify the doc name
256	doc_id_modifier = "." + sibling_num + ".ss";
257	}
258
259	boolean expand_document = false;
260	String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
261	if (ed_arg != null && ed_arg.equals("1"))
262	{
263	expand_document = true;
264	}
265
266	boolean expand_contents = false;
267	if (expand_document)
268	{ // we always expand the contents with the text
269	expand_contents = true;
270	}
271	else
272	{
273	String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
274	if (ec_arg != null && ec_arg.equals("1"))
275	{
276	expand_contents = true;
277	}
278	}
279
280	// do we want text content? Not if no_text=1.
281	// expand_document overrides this. - should it??
282	boolean get_text = true;
283	String nt_arg = (String) params.get(NO_TEXT_ARG);
284
285	if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
286	logger.debug("SETTING GET TEXT TO FALSE");
287	get_text = false;
288	} else {
289	logger.debug("GET TEXT REMAINS TRUE");
290	}
291
292	// the_document is where all the doc info - structure and metadata etc
293	// is added into, to be returned in the page
294	Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
295	page_response.appendChild(the_document);
296
297	// create a basic doc list containing the current node
298	Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
299	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
300	basic_doc_list.appendChild(current_doc);
301	if (document_id != null)
302	{
303	current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
304	}
305	else
306	{
307	current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
308	// do we need this??
309	current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
310	}
311
312	if (document_type == null)
313	{
314	document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
315	}
316	if (document_type == null)
317	{
318	logger.debug("##### doctype is null, setting to simple");
319	document_type = GSXML.DOC_TYPE_SIMPLE;
320	}
321
322	the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
323
324	// Create a parameter list to specify the required structure information
325	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
326
327	if (service_params != null)
328	{
329	GSXML.addParametersToList(ds_param_list, service_params);
330	}
331
332	Element ds_param = null;
333	boolean get_structure = false;
334	boolean get_structure_info = false;
335	if (document_type.equals(GSXML.DOC_TYPE_PAGED))
336	{
337	get_structure_info = true;
338
339	if (expand_contents)
340	{
341	ds_param = doc.createElement(GSXML.PARAM_ELEM);
342	ds_param_list.appendChild(ds_param);
343	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
344	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
345	}
346
347	// get the info needed for paged naviagtion
348	ds_param = doc.createElement(GSXML.PARAM_ELEM);
349	ds_param_list.appendChild(ds_param);
350	ds_param.setAttribute(GSXML.NAME_ATT, "info");
351	ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
352	ds_param = doc.createElement(GSXML.PARAM_ELEM);
353	ds_param_list.appendChild(ds_param);
354	ds_param.setAttribute(GSXML.NAME_ATT, "info");
355	ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
356	ds_param = doc.createElement(GSXML.PARAM_ELEM);
357	ds_param_list.appendChild(ds_param);
358	ds_param.setAttribute(GSXML.NAME_ATT, "info");
359	ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
360
361	if (get_siblings)
362	{
363	ds_param = doc.createElement(GSXML.PARAM_ELEM);
364	ds_param_list.appendChild(ds_param);
365	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
366	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
367	}
368
369	}
370	else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) \|\| document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
371	{
372	get_structure = true;
373	if (expand_contents)
374	{
375	ds_param = doc.createElement(GSXML.PARAM_ELEM);
376	ds_param_list.appendChild(ds_param);
377	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
378	ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
379	}
380	else
381	{
382	// get the info needed for table of contents
383	ds_param = doc.createElement(GSXML.PARAM_ELEM);
384	ds_param_list.appendChild(ds_param);
385	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
386	ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
387	ds_param = doc.createElement(GSXML.PARAM_ELEM);
388	ds_param_list.appendChild(ds_param);
389	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
390	ds_param.setAttribute(GSXML.VALUE_ATT, "children");
391	if (get_siblings)
392	{
393	ds_param = doc.createElement(GSXML.PARAM_ELEM);
394	ds_param_list.appendChild(ds_param);
395	ds_param.setAttribute(GSXML.NAME_ATT, "structure");
396	ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
397	}
398	}
399	}
400	else
401	{
402	// we dont need any structure
403	}
404
405	boolean has_dummy = false;
406	if (get_structure \|\| get_structure_info)
407	{
408
409	// Build a request to obtain the document structure
410	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
411	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
412	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
413	ds_message.appendChild(ds_request);
414	ds_request.appendChild(ds_param_list);
415
416	// add the node list we created earlier
417	ds_request.appendChild(basic_doc_list);
418
419	// Process the document structure retrieve message
420	Element ds_response_message = (Element) this.mr.process(ds_message);
421	if (processErrorElements(ds_response_message, page_response))
422	{
423	return result;
424	}
425
426	// get the info and print out
427	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
428	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
429	path = GSPath.appendLink(path, "nodeStructureInfo");
430	Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
431	// get the doc_node bit
432	if (ds_response_struct_info != null)
433	{
434	the_document.appendChild(doc.importNode(ds_response_struct_info, true));
435	}
436	path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
437	path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
438	path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
439	Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
440
441	if (ds_response_structure != null)
442	{
443	// add the contents of the structure bit into the_document
444	NodeList structs = ds_response_structure.getChildNodes();
445	for (int i = 0; i < structs.getLength(); i++)
446	{
447	the_document.appendChild(doc.importNode(structs.item(i), true));
448	}
449	}
450	else
451	{
452	// no structure nodes, so put in a dummy doc node
453	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
454	if (document_id != null)
455	{
456	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
457	}
458	else
459	{
460	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
461
462	}
463	the_document.appendChild(doc_node);
464	has_dummy = true;
465	}
466	}
467	else
468	{ // a simple type - we dont have a dummy node for simple
469	// should think about this more
470	// no structure request, so just put in a dummy doc node
471	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
472	if (document_id != null)
473	{
474	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
475	}
476	else
477	{
478	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
479	}
480	the_document.appendChild(doc_node);
481	has_dummy = true;
482	}
483
484	// Build a request to obtain some document metadata
485	Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
486	String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
487	Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
488	dm_message.appendChild(dm_request);
489	// Create a parameter list to specify the required metadata information
490
491	HashSet<String> meta_names = new HashSet<String>();
492	meta_names.add("Title"); // the default
493	if (format_elem != null)
494	{
495	getRequiredMetadataNames(format_elem, meta_names);
496	}
497
498	Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
499	if (extraMetaListElem != null)
500	{
501	NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
502	for (int i = 0; i < extraMetaList.getLength(); i++)
503	{
504	meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
505	}
506	}
507
508	Element dm_param_list = createMetadataParamList(doc,meta_names);
509	if (service_params != null)
510	{
511	GSXML.addParametersToList(dm_param_list, service_params);
512	}
513
514	dm_request.appendChild(dm_param_list);
515
516	// create the doc node list for the metadata request
517	Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
518	dm_request.appendChild(dm_doc_list);
519
520	// Add each node from the structure response into the metadata request
521	NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
522	for (int i = 0; i < doc_nodes.getLength(); i++)
523	{
524	Element doc_node = (Element) doc_nodes.item(i);
525	String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
526
527	// Add the documentNode to the list
528	Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
529	dm_doc_list.appendChild(dm_doc_node);
530	dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
531	dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
532	if (document_id == null){
533	dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
534	}
535
536	}
537
538	// we also want a metadata request to the top level document to get
539	// assocfilepath - this could be cached too
540	Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
541	dm_message.appendChild(doc_meta_request);
542	Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
543	if (service_params != null)
544	{
545	GSXML.addParametersToList(doc_meta_param_list, service_params);
546	}
547
548	doc_meta_request.appendChild(doc_meta_param_list);
549	Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
550	doc_meta_param_list.appendChild(doc_param);
551	doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
552	doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
553
554	// create the doc node list for the metadata request
555	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
556	doc_meta_request.appendChild(doc_list);
557
558	Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
559	// the node we want is the root document node
560	if (document_id != null)
561	{
562	doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
563	}
564	/*else
565	{
566	doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
567	// can we assume that href is always a top level doc??
568	//doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
569	//doc_node.setAttribute("externalURL", has_rl);
570	}*/
571	doc_list.appendChild(doc_node);
572
573	Element dm_response_message = (Element) this.mr.process(dm_message);
574	if (processErrorElements(dm_response_message, page_response))
575	{
576	return result;
577	}
578
579	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
580	Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
581
582	// Merge the metadata with the structure information
583	NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
584	for (int i = 0; i < doc_nodes.getLength(); i++)
585	{
586	GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
587	}
588	// get the top level doc metadata out
589	Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
590	Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
591	GSXML.mergeMetadataLists(the_document, top_doc_node);
592
593	// do we want doc text content? If not, we are done.
594	if (!get_text) {
595	// don't get text
596	return result;
597	}
598
599	// Build a request to obtain some document content
600	Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
601	to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
602	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
603	dc_message.appendChild(dc_request);
604
605	// Create a parameter list to specify the request parameters - empty for now
606	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
607	if (service_params != null)
608	{
609	GSXML.addParametersToList(dc_param_list, service_params);
610	}
611
612	dc_request.appendChild(dc_param_list);
613
614	// get the content
615	// the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
616	if (expand_document)
617	{
618	dc_request.appendChild(dm_doc_list);
619	}
620	else
621	{
622	dc_request.appendChild(basic_doc_list);
623	}
624	Element dc_response_message = (Element) this.mr.process(dc_message);
625	if (processErrorElements(dc_response_message, page_response))
626	{
627	return result;
628	}
629	Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
630
631	if (expand_document)
632	{
633	// Merge the content with the structure information
634	NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
635	for (int i = 0; i < doc_nodes.getLength(); i++)
636	{
637	Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
638	if (content != null)
639	{
640	if (highlight_query_terms)
641	{
642	String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
643	content = highlightQueryTerms(request, node_id, (Element) content);
644	}
645
646	doc_nodes.item(i).appendChild(doc.importNode(content, true));
647	}
648	//GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
649	}
650	if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
651	Element dummy_node = (Element) doc_nodes.item(0);
652	the_document.removeChild(dummy_node);
653	the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
654	NodeList dummy_children = dummy_node.getChildNodes();
655	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
656	{
657	// special case as we don't want more than one metadata list
658	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
659	{
660	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
661	}
662	else
663	{
664	the_document.appendChild(dummy_children.item(i));
665	}
666	}
667	}
668	}
669	else
670	{
671	//path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
672	Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
673	Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
674	//Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
675
676	if (dc_response_doc_content == null)
677	{
678	// no content to add
679	if (dc_response_doc.getAttribute("external").equals("true"))
680	{
681
682	//if (dc_response_doc_external != null)
683	//{
684	String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
685
686	the_document.setAttribute("selectedNode", href_id);
687	the_document.setAttribute("external", href_id);
688	}
689	return result;
690	}
691	if (highlight_query_terms)
692	{
693	dc_response_doc.removeChild(dc_response_doc_content);
694
695	dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
696	dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
697	}
698
699	if (provide_annotations)
700	{
701	String service_selected = (String) params.get(ENRICH_DOC_ARG);
702	if (service_selected != null && service_selected.equals("1"))
703	{
704	// now we can modifiy the response doc if needed
705	String enrich_service = (String) params.get(GSParams.SERVICE);
706	// send a message to the service
707	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
708	Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
709	enrich_message.appendChild(enrich_request);
710	// check for parameters
711	HashMap e_service_params = (HashMap) params.get("s1");
712	if (e_service_params != null)
713	{
714	Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
715	GSXML.addParametersToList(enrich_pl, e_service_params);
716	enrich_request.appendChild(enrich_pl);
717	}
718	Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
719	enrich_request.appendChild(e_doc_list);
720	e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
721
722	Node enrich_response = this.mr.process(enrich_message);
723
724	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
725	path = GSPath.createPath(links);
726	dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
727
728	}
729	} // if provide_annotations
730
731	// use the returned id rather than the sent one cos there may have
732	// been modifiers such as .pr that are removed.
733	String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
734	the_document.setAttribute("selectedNode", modified_doc_id);
735	if (has_dummy)
736	{
737	// change the id if necessary and add the content
738	Element dummy_node = (Element) doc_nodes.item(0);
739
740	dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
741	dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
742	// hack for simple type
743	if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
744	{
745	// we dont want the internal docNode, just want the content and metadata in the document
746	// rethink this!!
747	the_document.removeChild(dummy_node);
748
749	NodeList dummy_children = dummy_node.getChildNodes();
750	//for (int i=0; i<dummy_children.getLength(); i++) {
751	for (int i = dummy_children.getLength() - 1; i >= 0; i--)
752	{
753	// special case as we don't want more than one metadata list
754	if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
755	{
756	GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
757	}
758	else
759	{
760	the_document.appendChild(dummy_children.item(i));
761	}
762	}
763	}
764
765	the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
766	}
767	else
768	{
769	// Merge the document content with the metadata and structure information
770	for (int i = 0; i < doc_nodes.getLength(); i++)
771	{
772	Node dn = doc_nodes.item(i);
773	String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
774	if (dn_id.equals(modified_doc_id))
775	{
776	dn.appendChild(doc.importNode(dc_response_doc_content, true));
777	break;
778	}
779	}
780	}
781	}
782	//logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
783	return result;
784	}
785
786	/**
787	* tell the param class what its arguments are if an action has its own
788	* arguments, this should add them to the params object - particularly
789	* important for args that should not be saved
790	*/
791	public boolean addActionParameters(GSParams params)
792	{
793	params.addParameter(GOTO_PAGE_ARG, false);
794	params.addParameter(ENRICH_DOC_ARG, false);
795	params.addParameter(EXPAND_DOCUMENT_ARG, false);
796	params.addParameter(EXPAND_CONTENTS_ARG, false);
797	params.addParameter(REALISTIC_BOOK_ARG, false);
798
799	return true;
800	}
801
802	/**
803	* this method gets the collection description, the format info, the list of
804	* enrich services, etc - stuff that is needed for the page, but is the same
805	* whatever the query is - should be cached
806	*/
807	protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
808	{
809	Document doc = page_response.getOwnerDocument();
810
811	// create a message to process - contains requests for the collection
812	// description, the format element, the enrich services on offer
813	// these could all be cached
814	Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
815	String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
816	// the format request - ignore for now, where does this request go to??
817	Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
818	info_message.appendChild(format_request);
819
820	// the enrich_services request - only do this if provide_annotations is true
821
822	if (provide_annotations)
823	{
824	Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
825	enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
826	info_message.appendChild(enrich_services_request);
827	}
828
829	Element info_response = (Element) this.mr.process(info_message);
830
831	// the collection is the first response
832	NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
833	Element format_resp = (Element) responses.item(0);
834
835	Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
836	if (format_elem != null)
837	{
838	Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
839	if (global_format_elem != null)
840	{
841	GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
842	}
843
844	// set the format type
845	format_elem.setAttribute(GSXML.TYPE_ATT, "display");
846	page_response.appendChild(doc.importNode(format_elem, true));
847	}
848
849	if (provide_annotations)
850	{
851	Element services_resp = (Element) responses.item(1);
852
853	// a new message for the mr
854	Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
855	NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
856	boolean service_found = false;
857	for (int j = 0; j < e_services.getLength(); j++)
858	{
859	if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
860	{
861	Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
862	enrich_message.appendChild(s);
863	service_found = true;
864	}
865	}
866	if (service_found)
867	{
868	Element enrich_response = (Element) this.mr.process(enrich_message);
869
870	NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
871	Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
872	for (int i = 0; i < e_responses.getLength(); i++)
873	{
874	Element e_resp = (Element) e_responses.item(i);
875	Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
876	e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
877	service_list.appendChild(e_service);
878	}
879	page_response.appendChild(service_list);
880	}
881	} // if provide_annotations
882	return true;
883
884	}
885
886	protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
887	{
888	Document doc = basic_doc_list.getOwnerDocument();
889
890	Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
891	String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
892	Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
893	ds_message.appendChild(ds_request);
894
895	// Create a parameter list to specify the required structure information
896	Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
897	Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
898	ds_param_list.appendChild(ds_param);
899	ds_param.setAttribute(GSXML.NAME_ATT, "info");
900	ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
901
902	ds_request.appendChild(ds_param_list);
903
904	// add the node list we created earlier
905	ds_request.appendChild(basic_doc_list);
906
907	// Process the document structure retrieve message
908	Element ds_response_message = (Element) this.mr.process(ds_message);
909	if (processErrorElements(ds_response_message, page_response))
910	{
911	return null;
912	}
913
914	String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
915	String path = GSPath.createPath(links);
916	Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
917	if (info_elem == null) {
918	return null;
919	}
920	Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
921	if (doctype_elem != null)
922	{
923	String doc_type = doctype_elem.getAttribute("value");
924	return doc_type;
925	}
926	return null;
927	}
928
929	// Recursive method to set the docType, nodeType and nodeID attributes of each docNode
930	// The docType remains constant as in parameter document_type
931	// The nodeID for the first (root) docNode is already set. For all children, the rootNode id
932	// is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
933	// The nodeType is root if rootNode, internal if there are children and leaf if no children
934	protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
935
936	boolean isRoot = false;
937	if(id == null) { // rootNode, get the root nodeID to work with recursively
938	id = docNode.getAttribute(GSXML.NODE_ID_ATT);
939	isRoot = true;
940	} else { // for all but the root node, need to still set the nodeID
941	docNode.setAttribute(GSXML.NODE_ID_ATT, id);
942	}
943
944	docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
945
946	NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
947	if(docNodes.getLength() > 0) {
948	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
949	for(int i = 0; i < docNodes.getLength(); i++) {
950	Element childDocNode = (Element)docNodes.item(i);
951
952	// work out the child docNode's nodeID based on current id
953	String nodeID = id + "." + (i+1);
954	insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
955	}
956	} else {
957	docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
958	}
959
960	// rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
961	if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
962
963	}
964
965	/** run the XSLT transform which converts from doc.xml format to our internal document format */
966	protected Element transformArchiveToDocument(Element section) {
967
968	String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
969	if (stylesheet_filename == null) {
970	logger.error("Couldn't find stylesheet archive2document.xsl");
971	return section;
972	}
973
974	Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
975	if (stylesheet_doc == null) {
976	logger.error("Couldn't load in stylesheet "+stylesheet_filename);
977	return section;
978	}
979
980	Document section_doc = XMLConverter.newDOM();
981	section_doc.appendChild(section_doc.importNode(section, true));
982	Node result = this.transformer.transform(stylesheet_doc, section_doc);
983	logger.debug("transform result = "+XMLConverter.getPrettyString(result));
984
985	Element new_element;
986	if (result.getNodeType() == Node.DOCUMENT_NODE) {
987	new_element = ((Document) result).getDocumentElement();
988	} else {
989	new_element = (Element) result;
990	}
991
992
993	return new_element;
994
995	}
996
997
998	/**
999	* this involves a bit of a hack to get the equivalent query terms - has to
1000	* requery the query service - uses the last selected service name. (if it
1001	* ends in query). should this action do the query or should it send a
1002	* message to the query action? but that will involve lots of extra stuff.
1003	* also doesn't handle phrases properly - just highlights all the terms
1004	* found in the text.
1005	*/
1006	protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
1007	{
1008	Document doc = request.getOwnerDocument();
1009
1010	// do the query again to get term info
1011	Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1012	HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1013
1014	HashMap previous_params = (HashMap) params.get("p");
1015	if (previous_params == null)
1016	{
1017	return dc_response_doc_content;
1018	}
1019	String service_name = (String) previous_params.get(GSParams.SERVICE);
1020	if (service_name == null \|\| !service_name.endsWith("Query"))
1021	{ // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1022	logger.debug("invalid service, not doing highlighting");
1023	return dc_response_doc_content;
1024	}
1025	String collection = (String) params.get(GSParams.COLLECTION);
1026	UserContext userContext = new UserContext(request);
1027	String to = GSPath.appendLink(collection, service_name);
1028
1029	Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1030	Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1031	mr_query_message.appendChild(mr_query_request);
1032
1033	// paramList
1034	HashMap service_params = (HashMap) params.get("s1");
1035
1036	Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1037	GSXML.addParametersToList(query_param_list, service_params);
1038	if (current_node_id != null) {
1039	GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1040	} else {
1041	GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1042	}
1043	mr_query_request.appendChild(query_param_list);
1044	// do the query
1045	Element mr_query_response = (Element) this.mr.process(mr_query_message);
1046	String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1047	Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1048	// For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1049	if (highlighted_Node != null)
1050	{
1051	// Build a request to process highlighted text
1052
1053	Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1054	to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1055	Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1056	hl_message.appendChild(dc_request);
1057
1058	// Create a parameter list to specify the request parameters - empty for now
1059	Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1060	dc_request.appendChild(dc_param_list);
1061
1062	// get the content
1063	Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1064	dc_request.appendChild(doc_list);
1065	Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1066	doc_list.appendChild(current_doc);
1067	current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1068	//Append highlighted content to request for processing
1069	dc_request.appendChild(doc.importNode(highlighted_Node, true));
1070	Element hl_response_message = (Element) this.mr.process(hl_message);
1071
1072	//Get results
1073	NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1074	Element content = (Element) contentList.item(0);
1075	return content;
1076	}
1077	String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1078	Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1079	if (query_term_list_element == null)
1080	{
1081	// no term info
1082	logger.error("No query term information.\n");
1083	return dc_response_doc_content;
1084	}
1085
1086	String content = GSXML.getNodeText(dc_response_doc_content);
1087
1088	String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1089	Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1090
1091	HashSet<String> query_term_variants = new HashSet<String>();
1092	NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1093	if (equivalent_terms_nodelist == null \|\| equivalent_terms_nodelist.getLength() == 0)
1094	{
1095	NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1096	if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1097	{
1098	for (int i = 0; i < terms_nodelist.getLength(); i++)
1099	{
1100	String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1101	String termValueU = null;
1102	String termValueL = null;
1103
1104	if (termValue.length() > 1)
1105	{
1106	termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1107	termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1108	}
1109	else
1110	{
1111	termValueU = termValue.substring(0, 1).toUpperCase();
1112	termValueL = termValue.substring(0, 1).toLowerCase();
1113	}
1114
1115	query_term_variants.add(termValueU);
1116	query_term_variants.add(termValueL);
1117	}
1118	}
1119	}
1120	else
1121	{
1122	for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1123	{
1124	Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1125	String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1126	for (int j = 0; j < equivalent_terms.length; j++)
1127	{
1128	query_term_variants.add(equivalent_terms[j]);
1129	}
1130	}
1131	}
1132
1133	ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1134
1135	Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1136	String performed_query = GSXML.getNodeText(query_element) + " ";
1137
1138	ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1139	int term_start = 0;
1140	boolean in_term = false;
1141	boolean in_phrase = false;
1142	for (int i = 0; i < performed_query.length(); i++)
1143	{
1144	char character = performed_query.charAt(i);
1145	boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1146
1147	// Has a query term just started?
1148	if (in_term == false && is_character_letter_or_digit == true)
1149	{
1150	in_term = true;
1151	term_start = i;
1152	}
1153
1154	// Or has a term just finished?
1155	else if (in_term == true && is_character_letter_or_digit == false)
1156	{
1157	in_term = false;
1158	String term = performed_query.substring(term_start, i);
1159
1160	Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1161	if (term_element != null)
1162	{
1163
1164	HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1165
1166	NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1167	if (term_equivalent_terms_nodelist == null \|\| term_equivalent_terms_nodelist.getLength() == 0)
1168	{
1169	String termValueU = null;
1170	String termValueL = null;
1171
1172	if (term.length() > 1)
1173	{
1174	termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1175	termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1176	}
1177	else
1178	{
1179	termValueU = term.substring(0, 1).toUpperCase();
1180	termValueL = term.substring(0, 1).toLowerCase();
1181	}
1182
1183	phrase_query_p_term_x_variants.add(termValueU);
1184	phrase_query_p_term_x_variants.add(termValueL);
1185	}
1186	else
1187	{
1188	for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1189	{
1190	Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1191	String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1192	for (int k = 0; k < term_equivalent_terms.length; k++)
1193	{
1194	phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1195	}
1196	}
1197	}
1198	phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1199
1200	if (in_phrase == false)
1201	{
1202	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1203	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1204	}
1205	}
1206	}
1207	// Watch for phrases (surrounded by quotes)
1208	if (character == '\"')
1209	{
1210	// Has a phrase just started?
1211	if (in_phrase == false)
1212	{
1213	in_phrase = true;
1214	}
1215	// Or has a phrase just finished?
1216	else if (in_phrase == true)
1217	{
1218	in_phrase = false;
1219	phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1220	}
1221
1222	phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1223	}
1224	}
1225
1226	return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1227	}
1228
1229	/**
1230	* Highlights query terms in a piece of text.
1231	*/
1232	private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1233	{
1234	// Convert the content string to an array of characters for speed
1235	char[] content_characters = new char[content.length()];
1236	content.getChars(0, content.length(), content_characters, 0);
1237
1238	// Now skim through the content, identifying word matches
1239	ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1240	int word_start = 0;
1241	boolean in_word = false;
1242	boolean preceding_word_matched = false;
1243	boolean inTag = false;
1244	for (int i = 0; i < content_characters.length; i++)
1245	{
1246	//We don't want to find words inside HTML tags
1247	if (content_characters[i] == '<')
1248	{
1249	inTag = true;
1250	continue;
1251	}
1252	else if (inTag && content_characters[i] == '>')
1253	{
1254	inTag = false;
1255	}
1256	else if (inTag)
1257	{
1258	continue;
1259	}
1260
1261	boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1262
1263	// Has a word just started?
1264	if (in_word == false && is_character_letter_or_digit == true)
1265	{
1266	in_word = true;
1267	word_start = i;
1268	}
1269
1270	// Or has a word just finished?
1271	else if (in_word == true && is_character_letter_or_digit == false)
1272	{
1273	in_word = false;
1274
1275	// Check if the word matches any of the query term equivalents
1276	String word = new String(content_characters, word_start, (i - word_start));
1277	if (query_term_variants.contains(word))
1278	{
1279	// We have found a matching word, so remember its location
1280	word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1281	preceding_word_matched = true;
1282	}
1283	else
1284	{
1285	preceding_word_matched = false;
1286	}
1287	}
1288	}
1289
1290	// Don't forget the last word...
1291	if (in_word == true)
1292	{
1293	// Check if the word matches any of the query term equivalents
1294	String word = new String(content_characters, word_start, (content_characters.length - word_start));
1295	if (query_term_variants.contains(word))
1296	{
1297	// We have found a matching word, so remember its location
1298	word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1299	}
1300	}
1301
1302	ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1303	ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1304
1305	// Deal with phrases now
1306	ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1307	for (int i = 0; i < word_matches.size(); i++)
1308	{
1309	WordMatch word_match = word_matches.get(i);
1310
1311	// See if any partial phrase matches are extended by this word
1312	if (word_match.preceding_word_matched)
1313	{
1314	for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1315	{
1316	PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1317	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1318	HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1319	if (phrase_query_p_term_x_variants.contains(word_match.word))
1320	{
1321	partial_phrase_match.num_words_matched++;
1322
1323	// Has a complete phrase match occurred?
1324	if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1325	{
1326	// Check for overlaps by looking at the previous highlight range
1327	if (!highlight_end_positions.isEmpty())
1328	{
1329	int last_highlight_index = highlight_end_positions.size() - 1;
1330	int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1331	if (last_highlight_end > partial_phrase_match.start_position)
1332	{
1333	// There is an overlap, so remove the previous phrase match
1334	int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1335	highlight_end_positions.remove(last_highlight_index);
1336	partial_phrase_match.start_position = last_highlight_start;
1337	}
1338	}
1339
1340	highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1341	highlight_end_positions.add(new Integer(word_match.end_position));
1342	}
1343	// No, but add the partial match back into the list for next time
1344	else
1345	{
1346	partial_phrase_matches.add(partial_phrase_match);
1347	}
1348	}
1349	}
1350	}
1351	else
1352	{
1353	partial_phrase_matches.clear();
1354	}
1355
1356	// See if this word is at the start of any of the phrases
1357	for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1358	{
1359	ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1360	if (phrase_query_p_term_variants_list.size()>0) {
1361	HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1362	if (phrase_query_p_term_1_variants.contains(word_match.word))
1363	{
1364	// If this phrase is just one word long, we have a complete match
1365	if (phrase_query_p_term_variants_list.size() == 1)
1366	{
1367	highlight_start_positions.add(new Integer(word_match.start_position));
1368	highlight_end_positions.add(new Integer(word_match.end_position));
1369	}
1370	// Otherwise we have the start of a potential phrase match
1371	else
1372	{
1373	partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1374	}
1375	}
1376	}
1377	}
1378	}
1379
1380	// Now add the annotation tags into the document at the correct points
1381	Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1382
1383	int last_wrote = 0;
1384	for (int i = 0; i < highlight_start_positions.size(); i++)
1385	{
1386	int highlight_start = highlight_start_positions.get(i).intValue();
1387	int highlight_end = highlight_end_positions.get(i).intValue();
1388
1389	// Print anything before the highlight range
1390	if (last_wrote < highlight_start)
1391	{
1392	String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1393	content_element.appendChild(doc.createTextNode(preceding_text));
1394	}
1395
1396	// Print the highlight text, annotated
1397	if (highlight_end > last_wrote)
1398	{
1399	String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1400	Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1401	annotation_element.setAttribute("type", "query_term");
1402	content_element.appendChild(annotation_element);
1403	last_wrote = highlight_end;
1404	}
1405	}
1406
1407	// Finish off any unwritten text
1408	if (last_wrote < content_characters.length)
1409	{
1410	String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1411	content_element.appendChild(doc.createTextNode(remaining_text));
1412	}
1413	return content_element;
1414	}
1415
1416	static private class WordMatch
1417	{
1418	public String word;
1419	public int start_position;
1420	public int end_position;
1421	public boolean preceding_word_matched;
1422
1423	public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1424	{
1425	this.word = word;
1426	this.start_position = start_position;
1427	this.end_position = end_position;
1428	this.preceding_word_matched = preceding_word_matched;
1429	}
1430	}
1431
1432	static private class PartialPhraseMatch
1433	{
1434	public int start_position;
1435	public int query_phrase_number;
1436	public int num_words_matched;
1437
1438	public PartialPhraseMatch(int start_position, int query_phrase_number)
1439	{
1440	this.start_position = start_position;
1441	this.query_phrase_number = query_phrase_number;
1442	this.num_words_matched = 1;
1443	}
1444	}
1445	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: