source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 30553

Last change on this file since 30553 was 30553, checked in by kjdon, 8 years ago

added ability for the collectionConfig.xml file to carry additional stuff. Can have extraInfo element at the top level (inside collectionConfig). For now this is used to add extra items to the navigation bar (<navigationTab type=external-link

  • Property svn:keywords set to Author Date Id Revision
File size: 51.1 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.Arrays;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48 // this is used to specify that the sibling nodes of a selected one should be obtained
 // (process() asks for sibling structure when this argument has the value "1")
49 public static final String SIBLING_ARG = "sib";
 // when non-empty, process() appends ".<value>.ss" to the requested node ID
50 public static final String GOTO_PAGE_ARG = "gp";
 // when "1" and provide_annotations is on, process() sends the retrieved content
 // to the service named by the GSParams.SERVICE parameter
51 public static final String ENRICH_DOC_ARG = "end";
 // when "1", process() retrieves content for every node of the document
 // (this also forces the contents to be expanded)
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
 // when "1", process() requests the entire document structure
53 public static final String EXPAND_CONTENTS_ARG = "ec";
 // only registered in addActionParameters() in the code visible here;
 // presumably consumed by the interface/XSLT - TODO confirm
54 public static final String REALISTIC_BOOK_ARG = "book";
55
56 /**
57 * if this is set to true, when a document is displayed, any annotation type
58 * services (enrich) will be offered to the user as well
59 */
60 protected boolean provide_annotations = false;
61
 // switched on by configure() when the "highlightQueryTerms" config parameter is "true";
 // when set, process() passes retrieved node content through highlightQueryTermsOld()
62 protected boolean highlight_query_terms = false;
63
64 public boolean configure()
65 {
66 super.configure();
67 String highlight = (String) config_params.get("highlightQueryTerms");
68 if (highlight != null && highlight.equals("true"))
69 {
70 highlight_query_terms = true;
71 }
72 String annotate = (String) config_params.get("displayAnnotationService");
73 if (annotate != null && annotate.equals("true"))
74 {
75 provide_annotations = true;
76 }
77 return true;
78 }
79
80 public Node process(Node message_node)
81 {
82 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
83
84 Element message = GSXML.nodeToElement(message_node);
85 Document doc = message.getOwnerDocument();
86
87 // the response
88 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
89 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
90 result.appendChild(page_response);
91
92 // get the request - assume only one
93 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
94 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
95 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
96
97 // just in case there are some that need to get passed to the services
98 HashMap service_params = (HashMap) params.get("s0");
99
100 String collection = (String) params.get(GSParams.COLLECTION);
101 String document_id = (String) params.get(GSParams.DOCUMENT);
102 if (document_id != null && document_id.equals(""))
103 {
104 document_id = null;
105 }
106 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
107 if (href != null && href.equals(""))
108 {
109 href = null;
110 }
111 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
112 if (document_id == null && href == null)
113 {
114 logger.error("no document specified!");
115 return result;
116 }
117 if (rl != null && rl.equals("0"))
118 {
119 // this is a true external link, we should have been directed to a different page or action
120 logger.error("rl value was 0, shouldn't get here");
121 return result;
122 }
123
124 String query_terms = (String) params.get("terms");
125 logger.error("terms = "+query_terms);
126 String query = (String) params.get("query");
127 UserContext userContext = new UserContext(request);
128
129 //append site metadata
130 addSiteMetadata(page_response, userContext);
131 addInterfaceOptions(page_response);
132
133 // get the additional data needed for the page
134 getBackgroundData(page_response, collection, userContext);
135 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
136
137 if (format_elem != null) {
138 // lets look for param defaults set in config file
139 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
140 for (int i=0; i<param_defaults.getLength(); i++) {
141 Element p = (Element)param_defaults.item(i);
142 String name = p.getAttribute(GSXML.NAME_ATT);
143 if (params.get(name) ==null) {
144 // wasn't set from interface
145 String value = p.getAttribute(GSXML.VALUE_ATT);
146 params.put(name, value );
147 // also add into request param xml so that xslt knows it too
148 GSXML.addParameterToList(cgi_paramList, name, value);
149 }
150 }
151 }
152 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153 if (document_type != null && document_type.equals(""))
154 {
155 //document_type = "hierarchy";
156 document_type = null; // we'll get it later if not already specified
157 }
158 //whether to retrieve siblings or not
159 boolean get_siblings = false;
160 String sibs = (String) params.get(SIBLING_ARG);
161 if (sibs != null && sibs.equals("1"))
162 {
163 get_siblings = true;
164 }
165
166 String doc_id_modifier = "";
167 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
168 if (sibling_num != null && !sibling_num.equals(""))
169 {
170 // we have to modify the doc name
171 doc_id_modifier = "." + sibling_num + ".ss";
172 }
173
174 boolean expand_document = false;
175 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
176 if (ed_arg != null && ed_arg.equals("1"))
177 {
178 expand_document = true;
179 }
180
181 boolean expand_contents = false;
182 if (expand_document)
183 { // we always expand the contents with the text
184 expand_contents = true;
185 }
186 else
187 {
188 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
189 if (ec_arg != null && ec_arg.equals("1"))
190 {
191 expand_contents = true;
192 }
193 }
194
195 // UserContext userContext = new UserContext(request);
196
197 // //append site metadata
198 // addSiteMetadata(page_response, userContext);
199 // addInterfaceOptions(page_response);
200
201 // // get the additional data needed for the page
202 // getBackgroundData(page_response, collection, userContext);
203 // Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
204
205 // the_document is where all the doc info - structure and metadata etc
206 // is added into, to be returned in the page
207 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
208 page_response.appendChild(the_document);
209
210 // create a basic doc list containing the current node
211 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
212 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
213 basic_doc_list.appendChild(current_doc);
214 if (document_id != null)
215 {
216 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
217 }
218 else
219 {
220 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
221 // do we need this??
222 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
223 }
224
225 if (document_type == null)
226 {
227 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
228 }
229 if (document_type == null)
230 {
231 logger.error("doctype is null!!!***********");
232 document_type = GSXML.DOC_TYPE_SIMPLE;
233 }
234
235 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
236
237
238 // Create a parameter list to specify the required structure information
239 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
240
241 if (service_params != null)
242 {
243 GSXML.addParametersToList(ds_param_list, service_params);
244 }
245
246 Element ds_param = null;
247 boolean get_structure = false;
248 boolean get_structure_info = false;
249 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
250 {
251 get_structure_info = true;
252
253 if (expand_contents)
254 {
255 ds_param = doc.createElement(GSXML.PARAM_ELEM);
256 ds_param_list.appendChild(ds_param);
257 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
258 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
259 }
260
261 // get the info needed for paged naviagtion
262 ds_param = doc.createElement(GSXML.PARAM_ELEM);
263 ds_param_list.appendChild(ds_param);
264 ds_param.setAttribute(GSXML.NAME_ATT, "info");
265 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
266 ds_param = doc.createElement(GSXML.PARAM_ELEM);
267 ds_param_list.appendChild(ds_param);
268 ds_param.setAttribute(GSXML.NAME_ATT, "info");
269 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
270 ds_param = doc.createElement(GSXML.PARAM_ELEM);
271 ds_param_list.appendChild(ds_param);
272 ds_param.setAttribute(GSXML.NAME_ATT, "info");
273 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
274
275 if (get_siblings)
276 {
277 ds_param = doc.createElement(GSXML.PARAM_ELEM);
278 ds_param_list.appendChild(ds_param);
279 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
280 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
281 }
282
283 }
284 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
285 {
286 get_structure = true;
287 if (expand_contents)
288 {
289 ds_param = doc.createElement(GSXML.PARAM_ELEM);
290 ds_param_list.appendChild(ds_param);
291 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
292 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
293 }
294 else
295 {
296 // get the info needed for table of contents
297 ds_param = doc.createElement(GSXML.PARAM_ELEM);
298 ds_param_list.appendChild(ds_param);
299 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
300 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
301 ds_param = doc.createElement(GSXML.PARAM_ELEM);
302 ds_param_list.appendChild(ds_param);
303 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
304 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
305 if (get_siblings)
306 {
307 ds_param = doc.createElement(GSXML.PARAM_ELEM);
308 ds_param_list.appendChild(ds_param);
309 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
310 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
311 }
312 }
313 }
314 else
315 {
316 // we dont need any structure
317 }
318
319 boolean has_dummy = false;
320 if (get_structure || get_structure_info)
321 {
322
323 // Build a request to obtain the document structure
324 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
325 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
326 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
327 ds_message.appendChild(ds_request);
328 ds_request.appendChild(ds_param_list);
329
330 // add the node list we created earlier
331 ds_request.appendChild(basic_doc_list);
332
333 // Process the document structure retrieve message
334 Element ds_response_message = (Element) this.mr.process(ds_message);
335 if (processErrorElements(ds_response_message, page_response))
336 {
337 return result;
338 }
339
340 // get the info and print out
341 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
342 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
343 path = GSPath.appendLink(path, "nodeStructureInfo");
344 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
345 // get the doc_node bit
346 if (ds_response_struct_info != null)
347 {
348 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
349 }
350 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
351 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
352 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
353 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
354
355 if (ds_response_structure != null)
356 {
357 // add the contents of the structure bit into the_document
358 NodeList structs = ds_response_structure.getChildNodes();
359 for (int i = 0; i < structs.getLength(); i++)
360 {
361 the_document.appendChild(doc.importNode(structs.item(i), true));
362 }
363 }
364 else
365 {
366 // no structure nodes, so put in a dummy doc node
367 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
368 if (document_id != null)
369 {
370 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
371 }
372 else
373 {
374 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
375
376 }
377 the_document.appendChild(doc_node);
378 has_dummy = true;
379 }
380 }
381 else
382 { // a simple type - we dont have a dummy node for simple
383 // should think about this more
384 // no structure request, so just put in a dummy doc node
385 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
386 if (document_id != null)
387 {
388 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
389 }
390 else
391 {
392 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
393 }
394 the_document.appendChild(doc_node);
395 has_dummy = true;
396 }
397
398 // Build a request to obtain some document metadata
399 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
400 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
401 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
402 dm_message.appendChild(dm_request);
403 // Create a parameter list to specify the required metadata information
404
405 HashSet<String> meta_names = new HashSet<String>();
406 meta_names.add("Title"); // the default
407 if (format_elem != null)
408 {
409 getRequiredMetadataNames(format_elem, meta_names);
410 }
411
412 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
413 if (extraMetaListElem != null)
414 {
415 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
416 for (int i = 0; i < extraMetaList.getLength(); i++)
417 {
418 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
419 }
420 }
421
422 Element dm_param_list = createMetadataParamList(doc,meta_names);
423 if (service_params != null)
424 {
425 GSXML.addParametersToList(dm_param_list, service_params);
426 }
427
428 dm_request.appendChild(dm_param_list);
429
430 // create the doc node list for the metadata request
431 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
432 dm_request.appendChild(dm_doc_list);
433
434 // Add each node from the structure response into the metadata request
435 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
436 for (int i = 0; i < doc_nodes.getLength(); i++)
437 {
438 Element doc_node = (Element) doc_nodes.item(i);
439 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
440
441 // Add the documentNode to the list
442 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
443 dm_doc_list.appendChild(dm_doc_node);
444 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
445 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
446 if (document_id == null){
447 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
448 }
449
450 }
451
452 // we also want a metadata request to the top level document to get
453 // assocfilepath - this could be cached too
454 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
455 dm_message.appendChild(doc_meta_request);
456 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
457 if (service_params != null)
458 {
459 GSXML.addParametersToList(doc_meta_param_list, service_params);
460 }
461
462 doc_meta_request.appendChild(doc_meta_param_list);
463 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
464 doc_meta_param_list.appendChild(doc_param);
465 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
466 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
467
468 // create the doc node list for the metadata request
469 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
470 doc_meta_request.appendChild(doc_list);
471
472 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
473 // the node we want is the root document node
474 if (document_id != null)
475 {
476 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
477 }
478 /*else
479 {
480 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
481 // can we assume that href is always a top level doc??
482 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
483 //doc_node.setAttribute("externalURL", has_rl);
484 }*/
485 doc_list.appendChild(doc_node);
486
487 Element dm_response_message = (Element) this.mr.process(dm_message);
488 if (processErrorElements(dm_response_message, page_response))
489 {
490 return result;
491 }
492
493 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
494 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
495
496 // Merge the metadata with the structure information
497 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
498 for (int i = 0; i < doc_nodes.getLength(); i++)
499 {
500 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
501 }
502 // get the top level doc metadata out
503 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
504 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
505 GSXML.mergeMetadataLists(the_document, top_doc_node);
506
507 // Build a request to obtain some document content
508 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
509 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
510 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
511 dc_message.appendChild(dc_request);
512
513 // Create a parameter list to specify the request parameters - empty for now
514 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
515 if (service_params != null)
516 {
517 GSXML.addParametersToList(dc_param_list, service_params);
518 }
519
520 dc_request.appendChild(dc_param_list);
521
522 // get the content
523 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
524 if (expand_document)
525 {
526 dc_request.appendChild(dm_doc_list);
527 }
528 else
529 {
530 dc_request.appendChild(basic_doc_list);
531 }
532 logger.debug("request = " + XMLConverter.getString(dc_message));
533 Element dc_response_message = (Element) this.mr.process(dc_message);
534 if (processErrorElements(dc_response_message, page_response))
535 {
536 return result;
537 }
538
539 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
540
541 if (expand_document)
542 {
543 // Merge the content with the structure information
544 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
545 for (int i = 0; i < doc_nodes.getLength(); i++)
546 {
547 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
548 if (content != null)
549 {
550 if (highlight_query_terms)
551 {
552 content = highlightQueryTermsOld(request, (Element) content); // highlightQueryTerms(query_terms, query, request.getOwnerDocument(), (Element) content); //request, (Element) content);
553 }
554 doc_nodes.item(i).appendChild(doc.importNode(content, true));
555 }
556 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
557 }
558 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
559 Element dummy_node = (Element) doc_nodes.item(0);
560 the_document.removeChild(dummy_node);
561 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
562 NodeList dummy_children = dummy_node.getChildNodes();
563 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
564 {
565 // special case as we don't want more than one metadata list
566 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
567 {
568 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
569 }
570 else
571 {
572 the_document.appendChild(dummy_children.item(i));
573 }
574 }
575 }
576 }
577 else
578 {
579 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
580 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
581 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
582 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
583
584 if (dc_response_doc_content == null)
585 {
586 // no content to add
587 if (dc_response_doc.getAttribute("external").equals("true"))
588 {
589
590 //if (dc_response_doc_external != null)
591 //{
592 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
593
594 the_document.setAttribute("selectedNode", href_id);
595 the_document.setAttribute("external", href_id);
596 }
597 return result;
598 }
599 if (highlight_query_terms)
600 {
601 dc_response_doc.removeChild(dc_response_doc_content);
602
603 dc_response_doc_content = highlightQueryTermsOld(request, dc_response_doc_content); //highlightQueryTerms(query_terms, query, request.getOwnerDocument(), dc_response_doc_content); //request, dc_response_doc_content);
604 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
605 }
606
607 if (provide_annotations)
608 {
609 String service_selected = (String) params.get(ENRICH_DOC_ARG);
610 if (service_selected != null && service_selected.equals("1"))
611 {
612 // now we can modifiy the response doc if needed
613 String enrich_service = (String) params.get(GSParams.SERVICE);
614 // send a message to the service
615 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
616 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
617 enrich_message.appendChild(enrich_request);
618 // check for parameters
619 HashMap e_service_params = (HashMap) params.get("s1");
620 if (e_service_params != null)
621 {
622 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
623 GSXML.addParametersToList(enrich_pl, e_service_params);
624 enrich_request.appendChild(enrich_pl);
625 }
626 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
627 enrich_request.appendChild(e_doc_list);
628 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
629
630 Node enrich_response = this.mr.process(enrich_message);
631
632 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
633 path = GSPath.createPath(links);
634 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
635
636 }
637 } // if provide_annotations
638
639 // use the returned id rather than the sent one cos there may have
640 // been modifiers such as .pr that are removed.
641 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
642 the_document.setAttribute("selectedNode", modified_doc_id);
643 if (has_dummy)
644 {
645 // change the id if necessary and add the content
646 Element dummy_node = (Element) doc_nodes.item(0);
647
648 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
649 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
650 // hack for simple type
651 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
652 {
653 // we dont want the internal docNode, just want the content and metadata in the document
654 // rethink this!!
655 the_document.removeChild(dummy_node);
656
657 NodeList dummy_children = dummy_node.getChildNodes();
658 //for (int i=0; i<dummy_children.getLength(); i++) {
659 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
660 {
661 // special case as we don't want more than one metadata list
662 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
663 {
664 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
665 }
666 else
667 {
668 the_document.appendChild(dummy_children.item(i));
669 }
670 }
671 }
672
673 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
674 }
675 else
676 {
677 // Merge the document content with the metadata and structure information
678 for (int i = 0; i < doc_nodes.getLength(); i++)
679 {
680 Node dn = doc_nodes.item(i);
681 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
682 if (dn_id.equals(modified_doc_id))
683 {
684 dn.appendChild(doc.importNode(dc_response_doc_content, true));
685 break;
686 }
687 }
688 }
689 }
690 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
691 return result;
692 }
693
694 /**
695 * tell the param class what its arguments are if an action has its own
696 * arguments, this should add them to the params object - particularly
697 * important for args that should not be saved
698 */
699 public boolean addActionParameters(GSParams params)
700 {
701 params.addParameter(GOTO_PAGE_ARG, false);
702 params.addParameter(ENRICH_DOC_ARG, false);
703 params.addParameter(EXPAND_DOCUMENT_ARG, false);
704 params.addParameter(EXPAND_CONTENTS_ARG, false);
705 params.addParameter(REALISTIC_BOOK_ARG, false);
706
707 return true;
708 }
709
710 /**
711 * this method gets the collection description, the format info, the list of
712 * enrich services, etc - stuff that is needed for the page, but is the same
713 * whatever the query is - should be cached
714 */
715 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
716 {
717 Document doc = page_response.getOwnerDocument();
718
719 // create a message to process - contains requests for the collection
720 // description, the format element, the enrich services on offer
721 // these could all be cached
722 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
723 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
724 // the format request - ignore for now, where does this request go to??
725 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
726 info_message.appendChild(format_request);
727
728 // the enrich_services request - only do this if provide_annotations is true
729
730 if (provide_annotations)
731 {
732 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
733 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
734 info_message.appendChild(enrich_services_request);
735 }
736
737 Element info_response = (Element) this.mr.process(info_message);
738
739 // the collection is the first response
740 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
741 Element format_resp = (Element) responses.item(0);
742
743 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
744 if (format_elem != null)
745 {
746 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
747 if (global_format_elem != null)
748 {
749 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
750 }
751
752 // set the format type
753 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
754 page_response.appendChild(doc.importNode(format_elem, true));
755 }
756
757 if (provide_annotations)
758 {
759 Element services_resp = (Element) responses.item(1);
760
761 // a new message for the mr
762 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
763 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
764 boolean service_found = false;
765 for (int j = 0; j < e_services.getLength(); j++)
766 {
767 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
768 {
769 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
770 enrich_message.appendChild(s);
771 service_found = true;
772 }
773 }
774 if (service_found)
775 {
776 Element enrich_response = (Element) this.mr.process(enrich_message);
777
778 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
779 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
780 for (int i = 0; i < e_responses.getLength(); i++)
781 {
782 Element e_resp = (Element) e_responses.item(i);
783 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
784 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
785 service_list.appendChild(e_service);
786 }
787 page_response.appendChild(service_list);
788 }
789 } // if provide_annotations
790 return true;
791
792 }
793
794 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
795 {
796 Document doc = basic_doc_list.getOwnerDocument();
797
798 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
799 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
800 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
801 ds_message.appendChild(ds_request);
802
803 // Create a parameter list to specify the required structure information
804 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
805 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
806 ds_param_list.appendChild(ds_param);
807 ds_param.setAttribute(GSXML.NAME_ATT, "info");
808 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
809
810 ds_request.appendChild(ds_param_list);
811
812 // add the node list we created earlier
813 ds_request.appendChild(basic_doc_list);
814
815 // Process the document structure retrieve message
816 Element ds_response_message = (Element) this.mr.process(ds_message);
817 if (processErrorElements(ds_response_message, page_response))
818 {
819 return null;
820 }
821
822 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
823 String path = GSPath.createPath(links);
824 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
825 if (info_elem == null) {
826 return null;
827 }
828 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
829 if (doctype_elem != null)
830 {
831 String doc_type = doctype_elem.getAttribute("value");
832 return doc_type;
833 }
834 return null;
835 }
836
837 /**
838 * this involves a bit of a hack to get the equivalent query terms - has to
839 * requery the query service - uses the last selected service name. (if it
840 * ends in query). should this action do the query or should it send a
841 * message to the query action? but that will involve lots of extra stuff.
842 * also doesn't handle phrases properly - just highlights all the terms
843 * found in the text.
844 */
845 protected Element highlightQueryTerms(String terms, String performed_query, Document doc, Element dc_response_doc_content) {
846 logger.error("in highlight, terms = "+terms);
847 if (terms == null || performed_query == null) {
848 return dc_response_doc_content;
849 }
850 HashMap<String, HashSet<String>> term_to_variants_map = new HashMap<String, HashSet<String>>();
851 HashSet<String> query_term_variants = new HashSet<String>();
852
853 // terms in the form snail:snail,SNAILS,Snail;farm:farm,farming,Farming
854 String[] term_list = terms.split(";");
855 for (int i=0; i<term_list.length; i++) {
856 String term_x = term_list[i];
857 int colon_index = term_x.indexOf(';');
858 String main_term;
859 String term_variants;
860 if (colon_index == -1) {
861 main_term = term_x;
862 term_variants = main_term;
863 } else {
864 main_term = term_x.substring(0, colon_index);
865 term_variants = term_x.substring(colon_index+1);
866 }
867 query_term_variants.add(main_term);
868 term_to_variants_map.put(main_term, new HashSet<String>(Arrays.asList(term_variants.split(","))));
869 }
870
871 String content = GSXML.getNodeText(dc_response_doc_content);
872
873 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
874
875//Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
876//String performed_query = //GSXML.getNodeText(query_element) + " ";
877
878 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
879 int term_start = 0;
880 boolean in_term = false;
881 boolean in_phrase = false;
882 for (int i = 0; i < performed_query.length(); i++)
883 {
884 char character = performed_query.charAt(i);
885 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
886
887 // Has a query term just started?
888 if (in_term == false && is_character_letter_or_digit == true)
889 {
890 in_term = true;
891 term_start = i;
892 }
893
894 // Or has a term just finished?
895 else if (in_term == true && is_character_letter_or_digit == false)
896 {
897 in_term = false;
898 String term = performed_query.substring(term_start, i);
899 HashSet<String> phrase_query_p_term_x_variants = term_to_variants_map.get(term);
900 // Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
901 // if (term_element != null)
902 // {
903
904 // HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
905
906 // NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
907 // if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
908 // {
909 // String termValueU = null;
910 // String termValueL = null;
911
912 // if (term.length() > 1)
913 // {
914 // termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
915 // termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
916 // }
917 // else
918 // {
919 // termValueU = term.substring(0, 1).toUpperCase();
920 // termValueL = term.substring(0, 1).toLowerCase();
921 // }
922
923 // phrase_query_p_term_x_variants.add(termValueU);
924 // phrase_query_p_term_x_variants.add(termValueL);
925 // }
926 // else
927 // {
928 // for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
929 // {
930 // Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
931 // String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
932 // for (int k = 0; k < term_equivalent_terms.length; k++)
933 // {
934 // phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
935 // }
936 // }
937 // }
938 if (phrase_query_p_term_x_variants != null) {
939 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
940
941 if (in_phrase == false)
942 {
943 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
944 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
945 }
946 }
947 //}
948 }
949 // Watch for phrases (surrounded by quotes)
950 if (character == '\"')
951 {
952 // Has a phrase just started?
953 if (in_phrase == false)
954 {
955 in_phrase = true;
956 }
957 // Or has a phrase just finished?
958 else if (in_phrase == true)
959 {
960 in_phrase = false;
961 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
962 }
963
964 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
965 }
966 }
967
968 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
969 }
970 protected Element highlightQueryTermsOld(Element request, Element dc_response_doc_content)
971 {
972 Document doc = request.getOwnerDocument();
973
974 // do the query again to get term info
975 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
976 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
977
978 HashMap previous_params = (HashMap) params.get("p");
979 if (previous_params == null)
980 {
981 return dc_response_doc_content;
982 }
983 String service_name = (String) previous_params.get(GSParams.SERVICE);
984 if (service_name == null || !service_name.endsWith("Query"))
985 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
986 logger.debug("invalid service, not doing highlighting");
987 return dc_response_doc_content;
988 }
989 String collection = (String) params.get(GSParams.COLLECTION);
990 UserContext userContext = new UserContext(request);
991 String to = GSPath.appendLink(collection, service_name);
992
993 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
994 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
995 mr_query_message.appendChild(mr_query_request);
996
997 // paramList
998 HashMap service_params = (HashMap) params.get("s1");
999
1000 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1001 GSXML.addParametersToList(query_param_list, service_params);
1002 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1003 mr_query_request.appendChild(query_param_list);
1004
1005 // do the query
1006 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1007
1008 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1009 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1010 if (highlighted_Node != null)
1011 {
1012 // Build a request to process highlighted text
1013
1014 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1015 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1016 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1017 hl_message.appendChild(dc_request);
1018
1019 // Create a parameter list to specify the request parameters - empty for now
1020 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1021 dc_request.appendChild(dc_param_list);
1022
1023 // get the content
1024 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1025 dc_request.appendChild(doc_list);
1026 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1027 doc_list.appendChild(current_doc);
1028 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1029 //Append highlighted content to request for processing
1030 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1031
1032 Element hl_response_message = (Element) this.mr.process(hl_message);
1033 //Get results
1034 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1035 Element content = (Element) contentList.item(0);
1036 return content;
1037 }
1038
1039 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1040 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1041 if (query_term_list_element == null)
1042 {
1043 // no term info
1044 logger.error("No query term information.\n");
1045 return dc_response_doc_content;
1046 }
1047
1048 String content = GSXML.getNodeText(dc_response_doc_content);
1049
1050 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1051 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1052
1053 HashSet<String> query_term_variants = new HashSet<String>();
1054 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1055 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1056 {
1057 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1058 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1059 {
1060 for (int i = 0; i < terms_nodelist.getLength(); i++)
1061 {
1062 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1063 String termValueU = null;
1064 String termValueL = null;
1065
1066 if (termValue.length() > 1)
1067 {
1068 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1069 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1070 }
1071 else
1072 {
1073 termValueU = termValue.substring(0, 1).toUpperCase();
1074 termValueL = termValue.substring(0, 1).toLowerCase();
1075 }
1076
1077 query_term_variants.add(termValueU);
1078 query_term_variants.add(termValueL);
1079 }
1080 }
1081 }
1082 else
1083 {
1084 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1085 {
1086 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1087 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1088 for (int j = 0; j < equivalent_terms.length; j++)
1089 {
1090 query_term_variants.add(equivalent_terms[j]);
1091 }
1092 }
1093 }
1094
1095 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1096
1097 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1098 String performed_query = GSXML.getNodeText(query_element) + " ";
1099
1100 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1101 int term_start = 0;
1102 boolean in_term = false;
1103 boolean in_phrase = false;
1104 for (int i = 0; i < performed_query.length(); i++)
1105 {
1106 char character = performed_query.charAt(i);
1107 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1108
1109 // Has a query term just started?
1110 if (in_term == false && is_character_letter_or_digit == true)
1111 {
1112 in_term = true;
1113 term_start = i;
1114 }
1115
1116 // Or has a term just finished?
1117 else if (in_term == true && is_character_letter_or_digit == false)
1118 {
1119 in_term = false;
1120 String term = performed_query.substring(term_start, i);
1121
1122 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1123 if (term_element != null)
1124 {
1125
1126 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1127
1128 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1129 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1130 {
1131 String termValueU = null;
1132 String termValueL = null;
1133
1134 if (term.length() > 1)
1135 {
1136 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1137 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1138 }
1139 else
1140 {
1141 termValueU = term.substring(0, 1).toUpperCase();
1142 termValueL = term.substring(0, 1).toLowerCase();
1143 }
1144
1145 phrase_query_p_term_x_variants.add(termValueU);
1146 phrase_query_p_term_x_variants.add(termValueL);
1147 }
1148 else
1149 {
1150 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1151 {
1152 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1153 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1154 for (int k = 0; k < term_equivalent_terms.length; k++)
1155 {
1156 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1157 }
1158 }
1159 }
1160 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1161
1162 if (in_phrase == false)
1163 {
1164 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1165 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1166 }
1167 }
1168 }
1169 // Watch for phrases (surrounded by quotes)
1170 if (character == '\"')
1171 {
1172 // Has a phrase just started?
1173 if (in_phrase == false)
1174 {
1175 in_phrase = true;
1176 }
1177 // Or has a phrase just finished?
1178 else if (in_phrase == true)
1179 {
1180 in_phrase = false;
1181 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1182 }
1183
1184 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1185 }
1186 }
1187
1188 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1189 }
1190
1191 /**
1192 * Highlights query terms in a piece of text.
1193 */
1194 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1195 {
1196
1197 logger.error("size = "+ query_term_variants.size());
1198 // Convert the content string to an array of characters for speed
1199 char[] content_characters = new char[content.length()];
1200 content.getChars(0, content.length(), content_characters, 0);
1201
1202 // Now skim through the content, identifying word matches
1203 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1204 int word_start = 0;
1205 boolean in_word = false;
1206 boolean preceding_word_matched = false;
1207 boolean inTag = false;
1208 for (int i = 0; i < content_characters.length; i++)
1209 {
1210 //We don't want to find words inside HTML tags
1211 if (content_characters[i] == '<')
1212 {
1213 inTag = true;
1214 continue;
1215 }
1216 else if (inTag && content_characters[i] == '>')
1217 {
1218 inTag = false;
1219 }
1220 else if (inTag)
1221 {
1222 continue;
1223 }
1224
1225 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1226
1227 // Has a word just started?
1228 if (in_word == false && is_character_letter_or_digit == true)
1229 {
1230 in_word = true;
1231 word_start = i;
1232 }
1233
1234 // Or has a word just finished?
1235 else if (in_word == true && is_character_letter_or_digit == false)
1236 {
1237 in_word = false;
1238
1239 // Check if the word matches any of the query term equivalents
1240 String word = new String(content_characters, word_start, (i - word_start));
1241 if (query_term_variants.contains(word))
1242 {
1243 // We have found a matching word, so remember its location
1244 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1245 preceding_word_matched = true;
1246 }
1247 else
1248 {
1249 preceding_word_matched = false;
1250 }
1251 }
1252 }
1253
1254 // Don't forget the last word...
1255 if (in_word == true)
1256 {
1257 // Check if the word matches any of the query term equivalents
1258 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1259 if (query_term_variants.contains(word))
1260 {
1261 // We have found a matching word, so remember its location
1262 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1263 }
1264 }
1265
1266 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1267 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1268
1269 // Deal with phrases now
1270 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1271 for (int i = 0; i < word_matches.size(); i++)
1272 {
1273 WordMatch word_match = word_matches.get(i);
1274
1275 // See if any partial phrase matches are extended by this word
1276 if (word_match.preceding_word_matched)
1277 {
1278 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1279 {
1280 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1281 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1282 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1283 if (phrase_query_p_term_x_variants.contains(word_match.word))
1284 {
1285 partial_phrase_match.num_words_matched++;
1286
1287 // Has a complete phrase match occurred?
1288 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1289 {
1290 // Check for overlaps by looking at the previous highlight range
1291 if (!highlight_end_positions.isEmpty())
1292 {
1293 int last_highlight_index = highlight_end_positions.size() - 1;
1294 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1295 if (last_highlight_end > partial_phrase_match.start_position)
1296 {
1297 // There is an overlap, so remove the previous phrase match
1298 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1299 highlight_end_positions.remove(last_highlight_index);
1300 partial_phrase_match.start_position = last_highlight_start;
1301 }
1302 }
1303
1304 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1305 highlight_end_positions.add(new Integer(word_match.end_position));
1306 }
1307 // No, but add the partial match back into the list for next time
1308 else
1309 {
1310 partial_phrase_matches.add(partial_phrase_match);
1311 }
1312 }
1313 }
1314 }
1315 else
1316 {
1317 partial_phrase_matches.clear();
1318 }
1319
1320 // See if this word is at the start of any of the phrases
1321 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1322 {
1323 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1324 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1325 if (phrase_query_p_term_1_variants.contains(word_match.word))
1326 {
1327 // If this phrase is just one word long, we have a complete match
1328 if (phrase_query_p_term_variants_list.size() == 1)
1329 {
1330 highlight_start_positions.add(new Integer(word_match.start_position));
1331 highlight_end_positions.add(new Integer(word_match.end_position));
1332 }
1333 // Otherwise we have the start of a potential phrase match
1334 else
1335 {
1336 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1337 }
1338 }
1339 }
1340 }
1341
1342 // Now add the annotation tags into the document at the correct points
1343 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1344
1345 int last_wrote = 0;
1346 for (int i = 0; i < highlight_start_positions.size(); i++)
1347 {
1348 int highlight_start = highlight_start_positions.get(i).intValue();
1349 int highlight_end = highlight_end_positions.get(i).intValue();
1350
1351 // Print anything before the highlight range
1352 if (last_wrote < highlight_start)
1353 {
1354 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1355 content_element.appendChild(doc.createTextNode(preceding_text));
1356 }
1357
1358 // Print the highlight text, annotated
1359 if (highlight_end > last_wrote)
1360 {
1361 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1362 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1363 annotation_element.setAttribute("type", "query_term");
1364 content_element.appendChild(annotation_element);
1365 last_wrote = highlight_end;
1366 }
1367 }
1368
1369 // Finish off any unwritten text
1370 if (last_wrote < content_characters.length)
1371 {
1372 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1373 content_element.appendChild(doc.createTextNode(remaining_text));
1374 }
1375 return content_element;
1376 }
1377
1378 static private class WordMatch
1379 {
1380 public String word;
1381 public int start_position;
1382 public int end_position;
1383 public boolean preceding_word_matched;
1384
1385 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1386 {
1387 this.word = word;
1388 this.start_position = start_position;
1389 this.end_position = end_position;
1390 this.preceding_word_matched = preceding_word_matched;
1391 }
1392 }
1393
1394 static private class PartialPhraseMatch
1395 {
1396 public int start_position;
1397 public int query_phrase_number;
1398 public int num_words_matched;
1399
1400 public PartialPhraseMatch(int start_position, int query_phrase_number)
1401 {
1402 this.start_position = start_position;
1403 this.query_phrase_number = query_phrase_number;
1404 this.num_words_matched = 1;
1405 }
1406 }
1407}
Note: See TracBrowser for help on using the repository browser.