source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32505

Last change on this file since 32505 was 32505, checked in by kjdon, 6 years ago

Moved some code into a new method, getFormattedArchiveDoc. Modified the search term highlighting code: separated getting the query term variants from marking up the text, so that redoing the query is only done once. The text mark-up step can now be called on metadata too - useful if the document page displays a table of metadata and we want to highlight search terms in the table.

  • Property svn:keywords set to Author Date Id Revision
File size: 70.9 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
// log4j logger for this action class
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
 // Names of the request (cgi) parameters that process() looks up in the
 // extracted parameter map. Each flag is considered "on" when its value is "1",
 // except GOTO_PAGE_ARG which carries a value.
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib";
 // value is appended to the document id as "." + value + ".ss" before retrieval
50 public static final String GOTO_PAGE_ARG = "gp";
 // when "1" (and provide_annotations is true) the retrieved content is sent to the
 // service named by the GSParams.SERVICE parameter and replaced by that service's result
51 public static final String ENRICH_DOC_ARG = "end";
 // when "1" content is requested for every document node, and contents are expanded too
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
 // when "1" the "entire" structure is requested instead of ancestors/children
53 public static final String EXPAND_CONTENTS_ARG = "ec";
 // NOTE(review): not referenced in the visible part of this file - presumably used
 // elsewhere for the realistic book view; confirm
54 public static final String REALISTIC_BOOK_ARG = "book";
 // when "1" (and the document is not being expanded) no text content is retrieved
55 public static final String NO_TEXT_ARG = "noText";
 // when "1" process() returns the archive version of the document via getFormattedArchiveDoc()
56 public static final String DOC_EDIT_ARG = "docEdit";
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well
61 */
 // set to true by configure() when config param "displayAnnotationService" is "true"
62 protected boolean provide_annotations = false;
63
 // set to true by configure() when config param "highlightQueryTerms" is "true";
 // process() then tries to mark up query terms in metadata and content
64 protected boolean highlight_query_terms = false;
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
/**
 * Builds the page response for a document request.
 *
 * Reads the request's parameter list (collection, document id or href, and
 * the flags named by the *_ARG constants), adds site metadata, interface
 * options and background data to the response, then sends requests through
 * the message router (this.mr) to the collection's DocumentStructureRetrieve
 * (paged / hierarchy / paged-hierarchy document types only),
 * DocumentMetadataRetrieve and DocumentContentRetrieve services and merges
 * what comes back into a single document element appended to the page
 * response. When docEdit is "1" the work is delegated to
 * getFormattedArchiveDoc() instead. If highlighting is configured, query
 * terms are marked up in metadata and content; if annotations are configured
 * and requested, the content is passed through the selected enrich service.
 *
 * @param message_node the incoming message; only its first request child is processed
 * @return the response message. It is returned early, containing whatever has
 *         been added so far, when no document is specified, when rl is "0",
 *         when processErrorElements() returns true for a service response,
 *         when no text is wanted, or when the content response has no content.
 */
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 // why do we use s0 here and s1 in other places???
101 HashMap service_params = (HashMap) params.get("s0");
102
103 String collection = (String) params.get(GSParams.COLLECTION);
104 String document_id = (String) params.get(GSParams.DOCUMENT);
105 if (document_id != null && document_id.equals(""))
106 {
107 document_id = null;
108 }
109 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
110 if (href != null && href.equals(""))
111 {
112 href = null;
113 }
114 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
 // a document must be identified either by id or by href
115 if (document_id == null && href == null)
116 {
117 logger.error("no document specified!");
118 return result;
119 }
120 if (rl != null && rl.equals("0"))
121 {
122 // this is a true external link, we should have been directed to a different page or action
123 logger.error("rl value was 0, shouldn't get here");
124 return result;
125 }
126
127 UserContext userContext = new UserContext(request);
128
129 //append site metadata
130 addSiteMetadata(page_response, userContext);
131 addInterfaceOptions(page_response);
132
133 // get the additional data needed for the page
134 getBackgroundData(page_response, collection, userContext);
135 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
136
137 if (format_elem != null) {
138 // lets look for param defaults set in config file
139 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
140 for (int i=0; i<param_defaults.getLength(); i++) {
141 Element p = (Element)param_defaults.item(i);
142 String name = p.getAttribute(GSXML.NAME_ATT);
143 if (params.get(name) ==null) {
144 // wasn't set from interface
145 String value = p.getAttribute(GSXML.VALUE_ATT);
146 params.put(name, value );
147 // also add into request param xml so that xslt knows it too
148 GSXML.addParameterToList(cgi_paramList, name, value);
149 }
150 }
151 }
152
153 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
154 if (document_type != null && document_type.equals(""))
155 {
156 //document_type = "hierarchy";
157 document_type = null; // we'll get it later if not already specified
158 }
159 // what if it is null here?? Anu to check...
160
161
162 boolean editing_document = false;
163 String doc_edit = (String) params.get(DOC_EDIT_ARG);
164 if (doc_edit != null && doc_edit.equals("1")) {
165 editing_document = true;
166 }
167
168 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
169 if (editing_document) {
170 return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
171 }
172
173 //whether to retrieve siblings or not
174 boolean get_siblings = false;
175 String sibs = (String) params.get(SIBLING_ARG);
176 if (sibs != null && sibs.equals("1"))
177 {
178 get_siblings = true;
179 }
180
181 String doc_id_modifier = "";
182 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
183 if (sibling_num != null && !sibling_num.equals(""))
184 {
185 // we have to modify the doc name
186 doc_id_modifier = "." + sibling_num + ".ss";
187 }
188
189 boolean expand_document = false;
190 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
191 if (ed_arg != null && ed_arg.equals("1"))
192 {
193 expand_document = true;
194 }
195
196 boolean expand_contents = false;
197 if (expand_document)
198 { // we always expand the contents with the text
199 expand_contents = true;
200 }
201 else
202 {
203 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
204 if (ec_arg != null && ec_arg.equals("1"))
205 {
206 expand_contents = true;
207 }
208 }
209
210 // do we want text content? Not if no_text=1.
211 // expand_document overrides this. - should it??
212 boolean get_text = true;
213 String nt_arg = (String) params.get(NO_TEXT_ARG);
214
215 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
216 logger.debug("SETTING GET TEXT TO FALSE");
217 get_text = false;
218 } else {
219 logger.debug("GET TEXT REMAINS TRUE");
220 }
221
222 // the_document is where all the doc info - structure and metadata etc
223 // is added into, to be returned in the page
224 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
225 page_response.appendChild(the_document);
226
227 // create a basic doc list containing the current node
228 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
229 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
230 basic_doc_list.appendChild(current_doc);
231 if (document_id != null)
232 {
233 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
234 }
235 else
236 {
237 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
238 // do we need this??
239 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
240 }
241
 // if the type was not supplied as a parameter, ask for it; fall back to simple
242 if (document_type == null)
243 {
244 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
245 }
246 if (document_type == null)
247 {
248 logger.debug("##### doctype is null, setting to simple");
249 document_type = GSXML.DOC_TYPE_SIMPLE;
250 }
251
252 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
253
254 // start getting doc structure
255
256 // Create a parameter list to specify the required structure information
257 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
258
259 if (service_params != null)
260 {
261 GSXML.addParametersToList(ds_param_list, service_params);
262 }
263
264 Element ds_param = null;
265 boolean get_structure = false;
266 boolean get_structure_info = false;
267 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
268 {
269 get_structure_info = true;
270
271 if (expand_contents)
272 {
273 ds_param = doc.createElement(GSXML.PARAM_ELEM);
274 ds_param_list.appendChild(ds_param);
275 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
276 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
277 }
278
279 // get the info needed for paged naviagtion
280 ds_param = doc.createElement(GSXML.PARAM_ELEM);
281 ds_param_list.appendChild(ds_param);
282 ds_param.setAttribute(GSXML.NAME_ATT, "info");
283 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
284 ds_param = doc.createElement(GSXML.PARAM_ELEM);
285 ds_param_list.appendChild(ds_param);
286 ds_param.setAttribute(GSXML.NAME_ATT, "info");
287 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
288 ds_param = doc.createElement(GSXML.PARAM_ELEM);
289 ds_param_list.appendChild(ds_param);
290 ds_param.setAttribute(GSXML.NAME_ATT, "info");
291 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
292
293 if (get_siblings)
294 {
295 ds_param = doc.createElement(GSXML.PARAM_ELEM);
296 ds_param_list.appendChild(ds_param);
297 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
298 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
299 }
300
301 }
302 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
303 {
304 get_structure = true;
305 if (expand_contents)
306 {
307 ds_param = doc.createElement(GSXML.PARAM_ELEM);
308 ds_param_list.appendChild(ds_param);
309 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
310 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
311 }
312 else
313 {
314 // get the info needed for table of contents
315 ds_param = doc.createElement(GSXML.PARAM_ELEM);
316 ds_param_list.appendChild(ds_param);
317 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
318 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
319 ds_param = doc.createElement(GSXML.PARAM_ELEM);
320 ds_param_list.appendChild(ds_param);
321 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
322 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
323 if (get_siblings)
324 {
325 ds_param = doc.createElement(GSXML.PARAM_ELEM);
326 ds_param_list.appendChild(ds_param);
327 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
328 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
329 }
330 }
331 }
332 else
333 {
334 // we dont need any structure
335 }
336
 // has_dummy records that the_document holds a single placeholder documentNode
 // created here rather than nodes returned by the structure service
337 boolean has_dummy = false;
338 if (get_structure || get_structure_info)
339 {
340
341 // Build a request to obtain the document structure
342 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
343 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
344 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
345 ds_message.appendChild(ds_request);
346 ds_request.appendChild(ds_param_list);
347
348 // add the node list we created earlier
349 ds_request.appendChild(basic_doc_list);
350
351 // Process the document structure retrieve message
352 Element ds_response_message = (Element) this.mr.process(ds_message);
353 if (processErrorElements(ds_response_message, page_response))
354 {
355 return result;
356 }
357
358 // get the info and print out
359 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
360 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
361 path = GSPath.appendLink(path, "nodeStructureInfo");
362 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
363 // get the doc_node bit
364 if (ds_response_struct_info != null)
365 {
366 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
367 }
368 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
369 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
370 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
371 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
372
373 if (ds_response_structure != null)
374 {
375 // add the contents of the structure bit into the_document
376 NodeList structs = ds_response_structure.getChildNodes();
377 for (int i = 0; i < structs.getLength(); i++)
378 {
379 the_document.appendChild(doc.importNode(structs.item(i), true));
380 }
381 }
382 else
383 {
384 // no structure nodes, so put in a dummy doc node
385 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
386 if (document_id != null)
387 {
388 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
389 }
390 else
391 {
392 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
393
394 }
395 the_document.appendChild(doc_node);
396 has_dummy = true;
397 }
398 }
399 else
400 { // a simple type - we dont have a dummy node for simple
401 // should think about this more
402 // no structure request, so just put in a dummy doc node
403 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
404 if (document_id != null)
405 {
406 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
407 }
408 else
409 {
410 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
411 }
412 the_document.appendChild(doc_node);
413 has_dummy = true;
414 }
415
416 // end getting doc structure
417
418 // start getting doc metadata
419
420 // Build a request to obtain some document metadata
421 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
422 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
423 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
424 dm_message.appendChild(dm_request);
425 // Create a parameter list to specify the required metadata information
426
427 HashSet<String> meta_names = new HashSet<String>();
428 meta_names.add("Title"); // the default
429 if (format_elem != null)
430 {
431 getRequiredMetadataNames(format_elem, meta_names);
432 }
433
 // the request itself may also list extra metadata names to fetch
434 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
435 if (extraMetaListElem != null)
436 {
437 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
438 for (int i = 0; i < extraMetaList.getLength(); i++)
439 {
440 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
441 }
442 }
443
444 Element dm_param_list = createMetadataParamList(doc,meta_names);
445 if (service_params != null)
446 {
447 GSXML.addParametersToList(dm_param_list, service_params);
448 }
449
450 dm_request.appendChild(dm_param_list);
451
452 // create the doc node list for the metadata request
453 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
454 dm_request.appendChild(dm_doc_list);
455
456 // Add each node from the structure response into the metadata request
 // NOTE: getElementsByTagName returns a live list of all documentNode descendants of
 // the_document, so later changes to the_document are reflected in doc_nodes
457 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
458 for (int i = 0; i < doc_nodes.getLength(); i++)
459 {
460 Element doc_node = (Element) doc_nodes.item(i);
461 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
462
463 // Add the documentNode to the list
464 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
 // when needSectionContent(params) is true only the node whose id equals document_id
 // is added to the request; otherwise every node is added
465 if (needSectionContent(params)) {
466 if (doc_node_id.equals(document_id)) {
467 dm_doc_list.appendChild(dm_doc_node);
468 }
469 } else {
470 dm_doc_list.appendChild(dm_doc_node);
471 }
472 //dm_doc_list.appendChild(dm_doc_node);
473 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
474 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
475 if (document_id == null){
476 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
477 }
478
479 }
480 // we also want a metadata request to the top level document to get
481 // assocfilepath - this could be cached too
482 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
483 dm_message.appendChild(doc_meta_request);
484 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
485 if (service_params != null)
486 {
487 GSXML.addParametersToList(doc_meta_param_list, service_params);
488 }
489
490 doc_meta_request.appendChild(doc_meta_param_list);
491 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
492 doc_meta_param_list.appendChild(doc_param);
493 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
494 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
495
496 // create the doc node list for the metadata request
497 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
498 doc_meta_request.appendChild(doc_list);
499
500 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
501 // the node we want is the root document node
502 if (document_id != null)
503 {
504 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
505 }
506 /*else
507 {
508 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
509 // can we assume that href is always a top level doc??
510 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
511 //doc_node.setAttribute("externalURL", has_rl);
512 }*/
513 doc_list.appendChild(doc_node);
514
 // dm_message now carries two requests: per-node metadata and top level assocfilepath
515 Element dm_response_message = (Element) this.mr.process(dm_message);
516 if (processErrorElements(dm_response_message, page_response))
517 {
518 return result;
519 }
520
521 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
522 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
523
524 // Merge the metadata with the structure information
 // NOTE(review): dm_response_docs is never read below; nodes are matched by id/href instead
525 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
526 for (int i = 0; i < doc_nodes.getLength(); i++)
527 {
528 Node dcNode;
529 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
 // nodes without a nodeID are matched on their href id instead
530 if (node_idd.isEmpty()) {
531 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
532 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
533 } else {
534 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
535 }
536 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
537 }
538 // get the top level doc metadata out
 // NOTE(review): item(1) presumably is the response to the second request
 // (doc_meta_request) - assumes responses come back in request order; confirm
539 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
540 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
541 GSXML.mergeMetadataLists(the_document, top_doc_node);
542
543 // do we want doc text content? If not, we are done.
544 if (!get_text) {
545 // don't get text
546 return result;
547 }
548
549
550 HashSet<String> query_term_variants = null;
551 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
 // do_highlight_query_terms is the effective flag: configured on AND variants were obtained
552 boolean do_highlight_query_terms = highlight_query_terms;
553 if (highlight_query_terms) {
554 // lets get the query term equivalents
555 query_term_variants = new HashSet<String>();
556 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
557 if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) {
558 do_highlight_query_terms = false; // we couldn't get the terms
559 }
560 }
561
562 // lets try marking up the metadata with search terms
563 if (do_highlight_query_terms) {
564 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);
565 }
566
567 // Build a request to obtain some document content
568 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
569 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
570 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
571 dc_message.appendChild(dc_request);
572
573 // Create a parameter list to specify the request parameters - empty for now
574 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
575 if (service_params != null)
576 {
577 GSXML.addParametersToList(dc_param_list, service_params);
578 }
579
580 dc_request.appendChild(dc_param_list);
581
582 // get the content
583 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
 // appendChild moves the chosen list out of the request it was previously attached to;
 // that earlier request has already been processed by this point
584 if (expand_document)
585 {
586 dc_request.appendChild(dm_doc_list);
587 }
588 else
589 {
590 dc_request.appendChild(basic_doc_list);
591 }
592 Element dc_response_message = (Element) this.mr.process(dc_message);
593
594 if (processErrorElements(dc_response_message, page_response))
595 {
596 return result;
597
598 }
 // path still holds the response/documentNodeList path built for the metadata response
599 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
600
601 if (expand_document)
602 {
603 // Merge the content with the structure information
 // NOTE(review): dc_response_docs is never read below; nodes are matched by id instead
604 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
605 for (int i = 0; i < doc_nodes.getLength(); i++)
606 {
607 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
608 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
609 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
610 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
611 if (content != null)
612 {
613 if (do_highlight_query_terms)
614 {
615 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy);
616 }
617
618 doc_nodes.item(i).appendChild(doc.importNode(content, true));
619 }
620 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
621 }
 // simple documents: drop the placeholder node and lift its children into the_document
622 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
623 Element dummy_node = (Element) doc_nodes.item(0);
624 the_document.removeChild(dummy_node);
625 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
626 NodeList dummy_children = dummy_node.getChildNodes();
 // walk backwards: appendChild below removes the child from dummy_node, shrinking this live list
627 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
628 {
629 // special case as we don't want more than one metadata list
630 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
631 {
632 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
633 }
634 else
635 {
636 the_document.appendChild(dummy_children.item(i));
637 }
638 }
639 }
640 }
641 else
642 {
 // only one node was requested, so take the first documentNode of the response
643 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
644 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
645 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
646 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
647
648 if (dc_response_doc_content == null)
649 {
650 // no content to add
651 if (dc_response_doc.getAttribute("external").equals("true"))
652 {
653
654 //if (dc_response_doc_external != null)
655 //{
656 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
657
658 the_document.setAttribute("selectedNode", href_id);
659 the_document.setAttribute("external", href_id);
660 }
661 return result;
662 }
663 if (do_highlight_query_terms)
664 {
 // swap the original content element in the response doc for the highlighted version
665 dc_response_doc.removeChild(dc_response_doc_content);
666
667 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy);
668 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
669 }
670
671 if (provide_annotations)
672 {
673 String service_selected = (String) params.get(ENRICH_DOC_ARG);
674 if (service_selected != null && service_selected.equals("1"))
675 {
676 // now we can modifiy the response doc if needed
677 String enrich_service = (String) params.get(GSParams.SERVICE);
678 // send a message to the service
679 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
680 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
681 enrich_message.appendChild(enrich_request);
682 // check for parameters
683 HashMap e_service_params = (HashMap) params.get("s1");
684 if (e_service_params != null)
685 {
686 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
687 GSXML.addParametersToList(enrich_pl, e_service_params);
688 enrich_request.appendChild(enrich_pl);
689 }
690 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
691 enrich_request.appendChild(e_doc_list);
692 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
693
694 Node enrich_response = this.mr.process(enrich_message);
695
 // the content used from here on is whatever the enrich service returned
696 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
697 path = GSPath.createPath(links);
698 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
699
700 }
701 } // if provide_annotations
702
703 // use the returned id rather than the sent one cos there may have
704 // been modifiers such as .pr that are removed.
705 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
706 the_document.setAttribute("selectedNode", modified_doc_id);
707 if (has_dummy)
708 {
709 // change the id if necessary and add the content
710 Element dummy_node = (Element) doc_nodes.item(0);
711
712 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
713 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
714 // hack for simple type
715 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
716 {
717 // we dont want the internal docNode, just want the content and metadata in the document
718 // rethink this!!
719 the_document.removeChild(dummy_node);
720
721 NodeList dummy_children = dummy_node.getChildNodes();
722 //for (int i=0; i<dummy_children.getLength(); i++) {
 // walk backwards: appendChild below removes the child from dummy_node, shrinking this live list
723 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
724 {
725 // special case as we don't want more than one metadata list
726 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
727 {
728 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
729 }
730 else
731 {
732 the_document.appendChild(dummy_children.item(i));
733 }
734 }
735 }
736
737 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
738 }
739 else
740 {
741 // Merge the document content with the metadata and structure information
 // attach the content to the first structure node whose id matches the returned id
742 for (int i = 0; i < doc_nodes.getLength(); i++)
743 {
744 Node dn = doc_nodes.item(i);
745 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
746 if (dn_id.equals(modified_doc_id))
747 {
748 dn.appendChild(doc.importNode(dc_response_doc_content, true));
749 break;
750 }
751 }
752 }
753 }
754 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
755 return result;
756 }
757
758 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
759 // call get archive doc
760 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
761 String to = "DocXMLGetSection";
762 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
763 dx_message.appendChild(dx_request);
764 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
765 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
766 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
767 dx_request.appendChild(dx_section);
768
769 Element dx_response_message = (Element) this.mr.process(dx_message);
770 if (processErrorElements(dx_response_message, page_response))
771 {
772 return result;
773 }
774
775 // get the section out
776 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
777 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
778 if (section == null) {
779 logger.error("no archive doc returned for "+document_id);
780 return result;
781 }
782 // convert the archive format into the internal format that the page response requires
783
784 // work out doctype
785 // NOTE: this will be coming from collection database in index
786 // the archive file doesn't store this. So we have to assume
787 // that the doc type will not be changing with any
788 // modifications happening to archives.
789
790 // if doc type is null, then we need to work it out.
791 // create a basic doc list containing the current node
792
793 if (document_type == null) {
794 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
795 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
796 basic_doc_list.appendChild(current_doc);
797 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
798 basic_doc_list.appendChild(current_doc);
799 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
800 }
801
802 if (document_type == null) {
803 logger.debug("@@@ doctype is null, setting to simple");
804 document_type = GSXML.DOC_TYPE_SIMPLE;
805 }
806
807 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
808 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
809 page_response.appendChild(doc_elem);
810
811 Element transformed_section = transformArchiveToDocument(section);
812 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
813 // simple doc, only returning a single document node, which is the top level section.
814 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
815 GSXML.mergeElements(doc_elem, transformed_section);
816 return result;
817 }
818
819 // multi sectioned document.
820 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
821 // In docEdit mode, we obtain the text from archives, from doc.xml
822 // Now the transformation has replaced <Section> with <documentNode>
823 // Need to add nodeID, nodeType and docType attributes to each docNode
824 // as doc.xml doesn't store that.
825 insertDocNodeAttributes(transformed_section, document_type, null);
826 doc_elem.appendChild(doc.importNode(transformed_section, true));
827 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
828
829 return result;
830 }
831
832
833 private boolean needSectionContent(HashMap<String, Serializable> params) {
834 String document_id = (String) params.get(GSParams.DOCUMENT);
835 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
836 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
837 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
838 return true;
839 }
840
841 return false;
842 }
843 /**
844 * this method gets the collection description, the format info, the list of
845 * enrich services, etc - stuff that is needed for the page, but is the same
846 * whatever the query is - should be cached
847 */
848 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
849 {
850 Document doc = page_response.getOwnerDocument();
851
852 // create a message to process - contains requests for the collection
853 // description, the format element, the enrich services on offer
854 // these could all be cached
855 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
856 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
857 // the format request - ignore for now, where does this request go to??
858 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
859 info_message.appendChild(format_request);
860
861 // the enrich_services request - only do this if provide_annotations is true
862
863 if (provide_annotations)
864 {
865 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
866 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
867 info_message.appendChild(enrich_services_request);
868 }
869
870 Element info_response = (Element) this.mr.process(info_message);
871
872 // the collection is the first response
873 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
874 Element format_resp = (Element) responses.item(0);
875
876 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
877 if (format_elem != null)
878 {
879 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
880 if (global_format_elem != null)
881 {
882 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
883 }
884
885 // set the format type
886 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
887 page_response.appendChild(doc.importNode(format_elem, true));
888 }
889
890 if (provide_annotations)
891 {
892 Element services_resp = (Element) responses.item(1);
893
894 // a new message for the mr
895 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
896 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
897 boolean service_found = false;
898 for (int j = 0; j < e_services.getLength(); j++)
899 {
900 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
901 {
902 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
903 enrich_message.appendChild(s);
904 service_found = true;
905 }
906 }
907 if (service_found)
908 {
909 Element enrich_response = (Element) this.mr.process(enrich_message);
910
911 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
912 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
913 for (int i = 0; i < e_responses.getLength(); i++)
914 {
915 Element e_resp = (Element) e_responses.item(i);
916 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
917 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
918 service_list.appendChild(e_service);
919 }
920 page_response.appendChild(service_list);
921 }
922 } // if provide_annotations
923 return true;
924
925 }
926
927 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
928 {
929 Document doc = basic_doc_list.getOwnerDocument();
930
931 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
932 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
933 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
934 ds_message.appendChild(ds_request);
935
936 // Create a parameter list to specify the required structure information
937 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
938 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
939 ds_param_list.appendChild(ds_param);
940 ds_param.setAttribute(GSXML.NAME_ATT, "info");
941 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
942
943 ds_request.appendChild(ds_param_list);
944
945 // add the node list we created earlier
946 ds_request.appendChild(basic_doc_list);
947
948 // Process the document structure retrieve message
949 Element ds_response_message = (Element) this.mr.process(ds_message);
950 if (processErrorElements(ds_response_message, page_response))
951 {
952 return null;
953 }
954
955 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
956 String path = GSPath.createPath(links);
957 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
958 if (info_elem == null) {
959 return null;
960 }
961 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
962 if (doctype_elem != null)
963 {
964 String doc_type = doctype_elem.getAttribute("value");
965 return doc_type;
966 }
967 return null;
968 }
969
970 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
971 // The docType remains constant as in parameter document_type
972 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
973 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
974 // The nodeType is root if rootNode, internal if there are children and leaf if no children
975 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
976
977 boolean isRoot = false;
978 if(id == null) { // rootNode, get the root nodeID to work with recursively
979 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
980 isRoot = true;
981 } else { // for all but the root node, need to still set the nodeID
982 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
983 }
984
985 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
986
987 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
988 if(docNodes.getLength() > 0) {
989 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
990 for(int i = 0; i < docNodes.getLength(); i++) {
991 Element childDocNode = (Element)docNodes.item(i);
992
993 // work out the child docNode's nodeID based on current id
994 String nodeID = id + "." + (i+1);
995 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
996 }
997 } else {
998 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
999 }
1000
1001 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1002 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1003
1004 }
1005
1006 /** run the XSLT transform which converts from doc.xml format to our internal document format */
1007 protected Element transformArchiveToDocument(Element section) {
1008
1009 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1010 if (stylesheet_filename == null) {
1011 logger.error("Couldn't find stylesheet archive2document.xsl");
1012 return section;
1013 }
1014
1015 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1016 if (stylesheet_doc == null) {
1017 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1018 return section;
1019 }
1020
1021 Document section_doc = XMLConverter.newDOM();
1022 section_doc.appendChild(section_doc.importNode(section, true));
1023 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1024 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1025
1026 Element new_element;
1027 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1028 new_element = ((Document) result).getDocumentElement();
1029 } else {
1030 new_element = (Element) result;
1031 }
1032
1033
1034 return new_element;
1035
1036 }
1037
1038 /**
1039 * this involves a bit of a hack to get the equivalent query terms - has to
1040 * requery the query service - uses the last selected service name. (if it
1041 * ends in query). should this action do the query or should it send a
1042 * message to the query action? but that will involve lots of extra stuff.
1043 */
1044 protected boolean getQueryTermVariants(Element request, String current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1045 {
1046 Document doc = request.getOwnerDocument();
1047
1048 // do the query again to get term info
1049 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1050 //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list));
1051 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1052
1053 HashMap previous_params = (HashMap) params.get("p");
1054 if (previous_params == null)
1055 {
1056 //logger.error("no p parms");
1057 return false;
1058 }
1059 String service_name = (String) previous_params.get(GSParams.SERVICE);
1060 if (service_name == null || !service_name.endsWith("Query"))
1061 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1062 logger.debug("invalid service, not doing highlighting");
1063 return false;
1064 }
1065
1066 String collection = (String) params.get(GSParams.COLLECTION);
1067 UserContext userContext = new UserContext(request);
1068 String to = GSPath.appendLink(collection, service_name);
1069
1070 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1071 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1072 mr_query_message.appendChild(mr_query_request);
1073
1074 // paramList
1075 HashMap service_params = (HashMap) params.get("s1");
1076
1077 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1078 GSXML.addParametersToList(query_param_list, service_params);
1079 // is this only used for solr??? - do we still want it for solr??
1080 // if (current_node_id != null) {
1081 // GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1082 // } else {
1083 // GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1084 // }
1085 mr_query_request.appendChild(query_param_list);
1086 // do the query
1087
1088 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1089
1090 // find the term lists
1091 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1092 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1093 if (query_term_list_element == null)
1094 {
1095 // no term info
1096 logger.error("No query term information. xx\n");
1097 return false;
1098 }
1099 // logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element));
1100 //String content = GSXML.getNodeText(dc_response_doc_content);
1101
1102 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1103 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1104
1105 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1106 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1107 {
1108 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1109 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1110 {
1111 for (int i = 0; i < terms_nodelist.getLength(); i++)
1112 {
1113 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1114 String termValueU = null;
1115 String termValueL = null;
1116
1117 if (termValue.length() > 1)
1118 {
1119 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1120 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1121 }
1122 else
1123 {
1124 termValueU = termValue.substring(0, 1).toUpperCase();
1125 termValueL = termValue.substring(0, 1).toLowerCase();
1126 }
1127 query_term_variants.add(termValueU);
1128 query_term_variants.add(termValueL);
1129 }
1130 }
1131 }
1132 else
1133 {
1134 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1135 {
1136 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1137 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1138 for (int j = 0; j < equivalent_terms.length; j++)
1139 {
1140 query_term_variants.add(equivalent_terms[j]);
1141 }
1142 }
1143 }
1144
1145
1146 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1147 String performed_query = GSXML.getNodeText(query_element) + " ";
1148
1149 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1150 int term_start = 0;
1151 boolean in_term = false;
1152 boolean in_phrase = false;
1153 for (int i = 0; i < performed_query.length(); i++)
1154 {
1155 char character = performed_query.charAt(i);
1156 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1157
1158 // Has a query term just started?
1159 if (in_term == false && is_character_letter_or_digit == true)
1160 {
1161 in_term = true;
1162 term_start = i;
1163 }
1164
1165 // Or has a term just finished?
1166 else if (in_term == true && is_character_letter_or_digit == false)
1167 {
1168 in_term = false;
1169 String term = performed_query.substring(term_start, i);
1170
1171 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1172 if (term_element != null)
1173 {
1174
1175 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1176
1177 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1178 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1179 {
1180 String termValueU = null;
1181 String termValueL = null;
1182
1183 if (term.length() > 1)
1184 {
1185 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1186 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1187 }
1188 else
1189 {
1190 termValueU = term.substring(0, 1).toUpperCase();
1191 termValueL = term.substring(0, 1).toLowerCase();
1192 }
1193
1194 phrase_query_p_term_x_variants.add(termValueU);
1195 phrase_query_p_term_x_variants.add(termValueL);
1196 }
1197 else
1198 {
1199 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1200 {
1201 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1202 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1203 for (int k = 0; k < term_equivalent_terms.length; k++)
1204 {
1205 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1206 }
1207 }
1208 }
1209 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1210
1211 if (in_phrase == false)
1212 {
1213 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1214 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1215 }
1216 }
1217 }
1218 // Watch for phrases (surrounded by quotes)
1219 if (character == '\"')
1220 {
1221 // Has a phrase just started?
1222 if (in_phrase == false)
1223 {
1224 in_phrase = true;
1225 }
1226 // Or has a phrase just finished?
1227 else if (in_phrase == true)
1228 {
1229 in_phrase = false;
1230 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1231 }
1232
1233 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1234 }
1235 }
1236
1237 return true;
1238 }
1239
1240 /** redo the request to get the query terms then highlight them in the text
1241 *
1242 */
1243 protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content)
1244 {
1245 Document doc = request.getOwnerDocument();
1246
1247 // do the query again to get term info
1248 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1249 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1250
1251 HashMap previous_params = (HashMap) params.get("p");
1252 if (previous_params == null)
1253 {
1254 return dc_response_doc_content;
1255 }
1256 String service_name = (String) previous_params.get(GSParams.SERVICE);
1257 if (service_name == null || !service_name.endsWith("Query"))
1258 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1259 logger.debug("invalid service, not doing highlighting");
1260 return dc_response_doc_content;
1261 }
1262 String collection = (String) params.get(GSParams.COLLECTION);
1263 UserContext userContext = new UserContext(request);
1264 String to = GSPath.appendLink(collection, service_name);
1265
1266 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1267 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1268 mr_query_message.appendChild(mr_query_request);
1269
1270 // paramList
1271 HashMap service_params = (HashMap) params.get("s1");
1272
1273 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1274 GSXML.addParametersToList(query_param_list, service_params);
1275 if (current_node_id != null) {
1276 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1277 } else {
1278 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1279 }
1280 mr_query_request.appendChild(query_param_list);
1281 // do the query
1282 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1283 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1284 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1285 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1286 if (highlighted_Node != null)
1287 {
1288 // Build a request to process highlighted text
1289 logger.error("highlighted node is not null!!!!");
1290 logger.error(XMLConverter.getPrettyString(highlighted_Node));
1291 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1292 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1293 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1294 hl_message.appendChild(dc_request);
1295
1296 // Create a parameter list to specify the request parameters - empty for now
1297 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1298 dc_request.appendChild(dc_param_list);
1299
1300 // get the content
1301 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1302 dc_request.appendChild(doc_list);
1303 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1304 doc_list.appendChild(current_doc);
1305 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1306 //Append highlighted content to request for processing
1307 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1308 Element hl_response_message = (Element) this.mr.process(hl_message);
1309
1310 //Get results
1311 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1312 Element content = (Element) contentList.item(0);
1313 return content;
1314 }
1315 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1316 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1317 if (query_term_list_element == null)
1318 {
1319 // no term info
1320 logger.error("No query term information. yy\n");
1321 return dc_response_doc_content;
1322 }
1323
1324 String content = GSXML.getNodeText(dc_response_doc_content);
1325
1326 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1327 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1328
1329 HashSet<String> query_term_variants = new HashSet<String>();
1330 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1331 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1332 {
1333 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1334 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1335 {
1336 for (int i = 0; i < terms_nodelist.getLength(); i++)
1337 {
1338 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1339 String termValueU = null;
1340 String termValueL = null;
1341
1342 if (termValue.length() > 1)
1343 {
1344 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1345 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1346 }
1347 else
1348 {
1349 termValueU = termValue.substring(0, 1).toUpperCase();
1350 termValueL = termValue.substring(0, 1).toLowerCase();
1351 }
1352
1353 query_term_variants.add(termValueU);
1354 query_term_variants.add(termValueL);
1355 }
1356 }
1357 }
1358 else
1359 {
1360 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1361 {
1362 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1363 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1364 for (int j = 0; j < equivalent_terms.length; j++)
1365 {
1366 query_term_variants.add(equivalent_terms[j]);
1367 }
1368 }
1369 }
1370
1371 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1372
1373 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1374 String performed_query = GSXML.getNodeText(query_element) + " ";
1375
1376 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1377 int term_start = 0;
1378 boolean in_term = false;
1379 boolean in_phrase = false;
1380 for (int i = 0; i < performed_query.length(); i++)
1381 {
1382 char character = performed_query.charAt(i);
1383 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1384
1385 // Has a query term just started?
1386 if (in_term == false && is_character_letter_or_digit == true)
1387 {
1388 in_term = true;
1389 term_start = i;
1390 }
1391
1392 // Or has a term just finished?
1393 else if (in_term == true && is_character_letter_or_digit == false)
1394 {
1395 in_term = false;
1396 String term = performed_query.substring(term_start, i);
1397
1398 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1399 if (term_element != null)
1400 {
1401
1402 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1403
1404 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1405 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1406 {
1407 String termValueU = null;
1408 String termValueL = null;
1409
1410 if (term.length() > 1)
1411 {
1412 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1413 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1414 }
1415 else
1416 {
1417 termValueU = term.substring(0, 1).toUpperCase();
1418 termValueL = term.substring(0, 1).toLowerCase();
1419 }
1420
1421 phrase_query_p_term_x_variants.add(termValueU);
1422 phrase_query_p_term_x_variants.add(termValueL);
1423 }
1424 else
1425 {
1426 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1427 {
1428 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1429 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1430 for (int k = 0; k < term_equivalent_terms.length; k++)
1431 {
1432 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1433 }
1434 }
1435 }
1436 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1437
1438 if (in_phrase == false)
1439 {
1440 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1441 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1442 }
1443 }
1444 }
1445 // Watch for phrases (surrounded by quotes)
1446 if (character == '\"')
1447 {
1448 // Has a phrase just started?
1449 if (in_phrase == false)
1450 {
1451 in_phrase = true;
1452 }
1453 // Or has a phrase just finished?
1454 else if (in_phrase == true)
1455 {
1456 in_phrase = false;
1457 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1458 }
1459
1460 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1461 }
1462 }
1463
1464 return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1465 }
1466
1467 /**
1468 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1469 */
1470 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
1471
1472 //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
1473 NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1474 for (int j=named_elems.getLength()-1; j>=0; j--) {
1475 Element this_elem = (Element)named_elems.item(j);
1476 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy);
1477 this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1478 }
1479
1480
1481 //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
1482 return true;
1483 }
1484 /**
1485 * Highlights query terms in the text content of an element.
1486 */
1487 private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/ HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1488 {
1489 //logger.error("in hl internal, query terms are "+query_term_variants.toString());
1490 String content = GSXML.getNodeText(original_element);
1491 //logger.error("original elem = "+XMLConverter.getPrettyString(original_element));
1492 logger.error("highlighting content: "+content);
1493 // Convert the content string to an array of characters for speed
1494 char[] content_characters = new char[content.length()];
1495 content.getChars(0, content.length(), content_characters, 0);
1496
1497 // Now skim through the content, identifying word matches
1498 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1499 int word_start = 0;
1500 boolean in_word = false;
1501 boolean preceding_word_matched = false;
1502 boolean inTag = false;
1503 for (int i = 0; i < content_characters.length; i++)
1504 {
1505 //We don't want to find words inside HTML tags
1506 if (content_characters[i] == '<')
1507 {
1508 inTag = true;
1509 continue;
1510 }
1511 else if (inTag && content_characters[i] == '>')
1512 {
1513 inTag = false;
1514 }
1515 else if (inTag)
1516 {
1517 continue;
1518 }
1519
1520 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1521
1522 // Has a word just started?
1523 if (in_word == false && is_character_letter_or_digit == true)
1524 {
1525 in_word = true;
1526 word_start = i;
1527 }
1528
1529 // Or has a word just finished?
1530 else if (in_word == true && is_character_letter_or_digit == false)
1531 {
1532 in_word = false;
1533
1534 // Check if the word matches any of the query term equivalents
1535 String word = new String(content_characters, word_start, (i - word_start));
1536 //logger.error("word: "+word);
1537 if (query_term_variants.contains(word))
1538 {
1539 //logger.error("matched");
1540 // We have found a matching word, so remember its location
1541 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1542 preceding_word_matched = true;
1543 }
1544 else
1545 {
1546 preceding_word_matched = false;
1547 }
1548 }
1549 }
1550
1551 // Don't forget the last word...
1552 if (in_word == true)
1553 {
1554 // Check if the word matches any of the query term equivalents
1555 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1556 if (query_term_variants.contains(word))
1557 {
1558 // We have found a matching word, so remember its location
1559 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1560 }
1561 }
1562
1563 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1564 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1565
1566 // Deal with phrases now
1567 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1568 for (int i = 0; i < word_matches.size(); i++)
1569 {
1570 WordMatch word_match = word_matches.get(i);
1571
1572 // See if any partial phrase matches are extended by this word
1573 if (word_match.preceding_word_matched)
1574 {
1575 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1576 {
1577 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1578 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1579 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1580 if (phrase_query_p_term_x_variants.contains(word_match.word))
1581 {
1582 partial_phrase_match.num_words_matched++;
1583
1584 // Has a complete phrase match occurred?
1585 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1586 {
1587 // Check for overlaps by looking at the previous highlight range
1588 if (!highlight_end_positions.isEmpty())
1589 {
1590 int last_highlight_index = highlight_end_positions.size() - 1;
1591 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1592 if (last_highlight_end > partial_phrase_match.start_position)
1593 {
1594 // There is an overlap, so remove the previous phrase match
1595 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1596 highlight_end_positions.remove(last_highlight_index);
1597 partial_phrase_match.start_position = last_highlight_start;
1598 }
1599 }
1600
1601 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1602 highlight_end_positions.add(new Integer(word_match.end_position));
1603 }
1604 // No, but add the partial match back into the list for next time
1605 else
1606 {
1607 partial_phrase_matches.add(partial_phrase_match);
1608 }
1609 }
1610 }
1611 }
1612 else
1613 {
1614 partial_phrase_matches.clear();
1615 }
1616
1617 // See if this word is at the start of any of the phrases
1618 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1619 {
1620 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1621 if (phrase_query_p_term_variants_list.size()>0) {
1622 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1623 if (phrase_query_p_term_1_variants.contains(word_match.word))
1624 {
1625 // If this phrase is just one word long, we have a complete match
1626 if (phrase_query_p_term_variants_list.size() == 1)
1627 {
1628 highlight_start_positions.add(new Integer(word_match.start_position));
1629 highlight_end_positions.add(new Integer(word_match.end_position));
1630 }
1631 // Otherwise we have the start of a potential phrase match
1632 else
1633 {
1634 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1635 }
1636 }
1637 }
1638 }
1639 }
1640
1641 // Now add the annotation tags into the document at the correct points
1642 //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1643 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1644 int last_wrote = 0;
1645 for (int i = 0; i < highlight_start_positions.size(); i++)
1646 {
1647 int highlight_start = highlight_start_positions.get(i).intValue();
1648 int highlight_end = highlight_end_positions.get(i).intValue();
1649
1650 // Print anything before the highlight range
1651 if (last_wrote < highlight_start)
1652 {
1653 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1654 content_element.appendChild(doc.createTextNode(preceding_text));
1655 }
1656
1657 // Print the highlight text, annotated
1658 if (highlight_end > last_wrote)
1659 {
1660 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1661 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1662 annotation_element.setAttribute("type", "query_term");
1663 content_element.appendChild(annotation_element);
1664 last_wrote = highlight_end;
1665 }
1666 }
1667
1668 // Finish off any unwritten text
1669 if (last_wrote < content_characters.length)
1670 {
1671 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1672 content_element.appendChild(doc.createTextNode(remaining_text));
1673 }
1674 return content_element;
1675 }
1676
1677 private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1678 {
1679 // Convert the content string to an array of characters for speed
1680 char[] content_characters = new char[content.length()];
1681 content.getChars(0, content.length(), content_characters, 0);
1682
1683 // Now skim through the content, identifying word matches
1684 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1685 int word_start = 0;
1686 boolean in_word = false;
1687 boolean preceding_word_matched = false;
1688 boolean inTag = false;
1689 for (int i = 0; i < content_characters.length; i++)
1690 {
1691 //We don't want to find words inside HTML tags
1692 if (content_characters[i] == '<')
1693 {
1694 inTag = true;
1695 continue;
1696 }
1697 else if (inTag && content_characters[i] == '>')
1698 {
1699 inTag = false;
1700 }
1701 else if (inTag)
1702 {
1703 continue;
1704 }
1705
1706 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1707
1708 // Has a word just started?
1709 if (in_word == false && is_character_letter_or_digit == true)
1710 {
1711 in_word = true;
1712 word_start = i;
1713 }
1714
1715 // Or has a word just finished?
1716 else if (in_word == true && is_character_letter_or_digit == false)
1717 {
1718 in_word = false;
1719
1720 // Check if the word matches any of the query term equivalents
1721 String word = new String(content_characters, word_start, (i - word_start));
1722 if (query_term_variants.contains(word))
1723 {
1724 // We have found a matching word, so remember its location
1725 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1726 preceding_word_matched = true;
1727 }
1728 else
1729 {
1730 preceding_word_matched = false;
1731 }
1732 }
1733 }
1734
1735 // Don't forget the last word...
1736 if (in_word == true)
1737 {
1738 // Check if the word matches any of the query term equivalents
1739 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1740 if (query_term_variants.contains(word))
1741 {
1742 // We have found a matching word, so remember its location
1743 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1744 }
1745 }
1746
1747 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1748 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1749
1750 // Deal with phrases now
1751 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1752 for (int i = 0; i < word_matches.size(); i++)
1753 {
1754 WordMatch word_match = word_matches.get(i);
1755
1756 // See if any partial phrase matches are extended by this word
1757 if (word_match.preceding_word_matched)
1758 {
1759 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1760 {
1761 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1762 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1763 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1764 if (phrase_query_p_term_x_variants.contains(word_match.word))
1765 {
1766 partial_phrase_match.num_words_matched++;
1767
1768 // Has a complete phrase match occurred?
1769 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1770 {
1771 // Check for overlaps by looking at the previous highlight range
1772 if (!highlight_end_positions.isEmpty())
1773 {
1774 int last_highlight_index = highlight_end_positions.size() - 1;
1775 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1776 if (last_highlight_end > partial_phrase_match.start_position)
1777 {
1778 // There is an overlap, so remove the previous phrase match
1779 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1780 highlight_end_positions.remove(last_highlight_index);
1781 partial_phrase_match.start_position = last_highlight_start;
1782 }
1783 }
1784
1785 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1786 highlight_end_positions.add(new Integer(word_match.end_position));
1787 }
1788 // No, but add the partial match back into the list for next time
1789 else
1790 {
1791 partial_phrase_matches.add(partial_phrase_match);
1792 }
1793 }
1794 }
1795 }
1796 else
1797 {
1798 partial_phrase_matches.clear();
1799 }
1800
1801 // See if this word is at the start of any of the phrases
1802 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1803 {
1804 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1805 if (phrase_query_p_term_variants_list.size()>0) {
1806 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1807 if (phrase_query_p_term_1_variants.contains(word_match.word))
1808 {
1809 // If this phrase is just one word long, we have a complete match
1810 if (phrase_query_p_term_variants_list.size() == 1)
1811 {
1812 highlight_start_positions.add(new Integer(word_match.start_position));
1813 highlight_end_positions.add(new Integer(word_match.end_position));
1814 }
1815 // Otherwise we have the start of a potential phrase match
1816 else
1817 {
1818 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1819 }
1820 }
1821 }
1822 }
1823 }
1824
1825 // Now add the annotation tags into the document at the correct points
1826 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1827
1828 int last_wrote = 0;
1829 for (int i = 0; i < highlight_start_positions.size(); i++)
1830 {
1831 int highlight_start = highlight_start_positions.get(i).intValue();
1832 int highlight_end = highlight_end_positions.get(i).intValue();
1833
1834 // Print anything before the highlight range
1835 if (last_wrote < highlight_start)
1836 {
1837 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1838 content_element.appendChild(doc.createTextNode(preceding_text));
1839 }
1840
1841 // Print the highlight text, annotated
1842 if (highlight_end > last_wrote)
1843 {
1844 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1845 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1846 annotation_element.setAttribute("type", "query_term");
1847 content_element.appendChild(annotation_element);
1848 last_wrote = highlight_end;
1849 }
1850 }
1851
1852 // Finish off any unwritten text
1853 if (last_wrote < content_characters.length)
1854 {
1855 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1856 content_element.appendChild(doc.createTextNode(remaining_text));
1857 }
1858 return content_element;
1859 }
1860
1861 static private class WordMatch
1862 {
1863 public String word;
1864 public int start_position;
1865 public int end_position;
1866 public boolean preceding_word_matched;
1867
1868 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1869 {
1870 this.word = word;
1871 this.start_position = start_position;
1872 this.end_position = end_position;
1873 this.preceding_word_matched = preceding_word_matched;
1874 }
1875 }
1876
1877 static private class PartialPhraseMatch
1878 {
1879 public int start_position;
1880 public int query_phrase_number;
1881 public int num_words_matched;
1882
1883 public PartialPhraseMatch(int start_position, int query_phrase_number)
1884 {
1885 this.start_position = start_position;
1886 this.query_phrase_number = query_phrase_number;
1887 this.num_words_matched = 1;
1888 }
1889 }
1890}
Note: See TracBrowser for help on using the repository browser.