source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32069

Last change on this file since 32069 was 32069, checked in by kjdon, 6 years ago

forgot to add the import GlobalProperties line

  • Property svn:keywords set to Author Date Id Revision
File size: 49.7 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib";
50 public static final String GOTO_PAGE_ARG = "gp";
51 public static final String ENRICH_DOC_ARG = "end";
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
53 public static final String EXPAND_CONTENTS_ARG = "ec";
54 public static final String REALISTIC_BOOK_ARG = "book";
55 public static final String NO_TEXT_ARG = "noText";
56 public static final String DOC_EDIT_ARG = "docEdit";
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well
61 */
62 protected boolean provide_annotations = false;
63
64 protected boolean highlight_query_terms = false;
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 HashMap service_params = (HashMap) params.get("s0");
101
102 String collection = (String) params.get(GSParams.COLLECTION);
103 String document_id = (String) params.get(GSParams.DOCUMENT);
104 if (document_id != null && document_id.equals(""))
105 {
106 document_id = null;
107 }
108 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109 if (href != null && href.equals(""))
110 {
111 href = null;
112 }
113 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114 if (document_id == null && href == null)
115 {
116 logger.error("no document specified!");
117 return result;
118 }
119 if (rl != null && rl.equals("0"))
120 {
121 // this is a true external link, we should have been directed to a different page or action
122 logger.error("rl value was 0, shouldn't get here");
123 return result;
124 }
125
126 UserContext userContext = new UserContext(request);
127
128 //append site metadata
129 addSiteMetadata(page_response, userContext);
130 addInterfaceOptions(page_response);
131
132 // get the additional data needed for the page
133 getBackgroundData(page_response, collection, userContext);
134 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136 if (format_elem != null) {
137 // lets look for param defaults set in config file
138 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139 for (int i=0; i<param_defaults.getLength(); i++) {
140 Element p = (Element)param_defaults.item(i);
141 String name = p.getAttribute(GSXML.NAME_ATT);
142 if (params.get(name) ==null) {
143 // wasn't set from interface
144 String value = p.getAttribute(GSXML.VALUE_ATT);
145 params.put(name, value );
146 // also add into request param xml so that xslt knows it too
147 GSXML.addParameterToList(cgi_paramList, name, value);
148 }
149 }
150 }
151
152
153 boolean editing_document = false;
154 String doc_edit = (String) params.get(DOC_EDIT_ARG);
155 if (doc_edit != null && doc_edit.equals("1")) {
156 editing_document = true;
157 }
158
159 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
160 if (editing_document) {
161
162 // call get archive doc
163 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
164 String to = "DocXMLGetSection";
165 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
166 dx_message.appendChild(dx_request);
167 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
168 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
169 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
170 dx_request.appendChild(dx_section);
171
172 Element dx_response_message = (Element) this.mr.process(dx_message);
173 if (processErrorElements(dx_response_message, page_response))
174 {
175 return result;
176 }
177
178 // get the section out
179 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
180 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
181 if (section == null) {
182 logger.error("no archive doc returned for "+document_id);
183 return result;
184 }
185 // convert the archive format into the internal format that the page response requires
186
187 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
188 page_response.appendChild(doc_elem);
189 section.setAttribute(GSXML.NODE_ID_ATT, document_id);
190
191 Element transformed_section = transformArchiveToDocument(section);
192 doc_elem.appendChild(doc.importNode(transformed_section, true));
193 logger.error("dx result = "+XMLConverter.getPrettyString(result));
194 return result;
195 }
196
197 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
198 if (document_type != null && document_type.equals(""))
199 {
200 //document_type = "hierarchy";
201 document_type = null; // we'll get it later if not already specified
202 }
203 //whether to retrieve siblings or not
204 boolean get_siblings = false;
205 String sibs = (String) params.get(SIBLING_ARG);
206 if (sibs != null && sibs.equals("1"))
207 {
208 get_siblings = true;
209 }
210
211 String doc_id_modifier = "";
212 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
213 if (sibling_num != null && !sibling_num.equals(""))
214 {
215 // we have to modify the doc name
216 doc_id_modifier = "." + sibling_num + ".ss";
217 }
218
219 boolean expand_document = false;
220 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
221 if (ed_arg != null && ed_arg.equals("1"))
222 {
223 expand_document = true;
224 }
225
226 boolean expand_contents = false;
227 if (expand_document)
228 { // we always expand the contents with the text
229 expand_contents = true;
230 }
231 else
232 {
233 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
234 if (ec_arg != null && ec_arg.equals("1"))
235 {
236 expand_contents = true;
237 }
238 }
239
240 // do we want text content? Not if no_text=1.
241 // expand_document overrides this. - should it??
242 boolean get_text = true;
243 String nt_arg = (String) params.get(NO_TEXT_ARG);
244
245 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
246 logger.error("SETTING GET TEXT TO FALSE");
247 get_text = false;
248 } else {
249 logger.error("GET TEXT REMAINS TRUE");
250 }
251
252 // the_document is where all the doc info - structure and metadata etc
253 // is added into, to be returned in the page
254 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
255 page_response.appendChild(the_document);
256
257 // create a basic doc list containing the current node
258 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
259 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
260 basic_doc_list.appendChild(current_doc);
261 if (document_id != null)
262 {
263 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
264 }
265 else
266 {
267 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
268 // do we need this??
269 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
270 }
271
272 if (document_type == null)
273 {
274 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
275 }
276 if (document_type == null)
277 {
278 logger.debug("doctype is null, setting to simple");
279 document_type = GSXML.DOC_TYPE_SIMPLE;
280 }
281
282 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
283
284
285 // Create a parameter list to specify the required structure information
286 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
287
288 if (service_params != null)
289 {
290 GSXML.addParametersToList(ds_param_list, service_params);
291 }
292
293 Element ds_param = null;
294 boolean get_structure = false;
295 boolean get_structure_info = false;
296 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
297 {
298 get_structure_info = true;
299
300 if (expand_contents)
301 {
302 ds_param = doc.createElement(GSXML.PARAM_ELEM);
303 ds_param_list.appendChild(ds_param);
304 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
305 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
306 }
307
308 // get the info needed for paged naviagtion
309 ds_param = doc.createElement(GSXML.PARAM_ELEM);
310 ds_param_list.appendChild(ds_param);
311 ds_param.setAttribute(GSXML.NAME_ATT, "info");
312 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
313 ds_param = doc.createElement(GSXML.PARAM_ELEM);
314 ds_param_list.appendChild(ds_param);
315 ds_param.setAttribute(GSXML.NAME_ATT, "info");
316 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
317 ds_param = doc.createElement(GSXML.PARAM_ELEM);
318 ds_param_list.appendChild(ds_param);
319 ds_param.setAttribute(GSXML.NAME_ATT, "info");
320 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
321
322 if (get_siblings)
323 {
324 ds_param = doc.createElement(GSXML.PARAM_ELEM);
325 ds_param_list.appendChild(ds_param);
326 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
328 }
329
330 }
331 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
332 {
333 get_structure = true;
334 if (expand_contents)
335 {
336 ds_param = doc.createElement(GSXML.PARAM_ELEM);
337 ds_param_list.appendChild(ds_param);
338 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
340 }
341 else
342 {
343 // get the info needed for table of contents
344 ds_param = doc.createElement(GSXML.PARAM_ELEM);
345 ds_param_list.appendChild(ds_param);
346 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
347 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
348 ds_param = doc.createElement(GSXML.PARAM_ELEM);
349 ds_param_list.appendChild(ds_param);
350 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
351 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
352 if (get_siblings)
353 {
354 ds_param = doc.createElement(GSXML.PARAM_ELEM);
355 ds_param_list.appendChild(ds_param);
356 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
357 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
358 }
359 }
360 }
361 else
362 {
363 // we dont need any structure
364 }
365
366 boolean has_dummy = false;
367 if (get_structure || get_structure_info)
368 {
369
370 // Build a request to obtain the document structure
371 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
372 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
373 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
374 ds_message.appendChild(ds_request);
375 ds_request.appendChild(ds_param_list);
376
377 // add the node list we created earlier
378 ds_request.appendChild(basic_doc_list);
379
380 // Process the document structure retrieve message
381 Element ds_response_message = (Element) this.mr.process(ds_message);
382 if (processErrorElements(ds_response_message, page_response))
383 {
384 return result;
385 }
386
387 // get the info and print out
388 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
389 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
390 path = GSPath.appendLink(path, "nodeStructureInfo");
391 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
392 // get the doc_node bit
393 if (ds_response_struct_info != null)
394 {
395 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
396 }
397 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
398 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
399 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
400 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
401
402 if (ds_response_structure != null)
403 {
404 // add the contents of the structure bit into the_document
405 NodeList structs = ds_response_structure.getChildNodes();
406 for (int i = 0; i < structs.getLength(); i++)
407 {
408 the_document.appendChild(doc.importNode(structs.item(i), true));
409 }
410 }
411 else
412 {
413 // no structure nodes, so put in a dummy doc node
414 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
415 if (document_id != null)
416 {
417 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
418 }
419 else
420 {
421 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
422
423 }
424 the_document.appendChild(doc_node);
425 has_dummy = true;
426 }
427 }
428 else
429 { // a simple type - we dont have a dummy node for simple
430 // should think about this more
431 // no structure request, so just put in a dummy doc node
432 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
433 if (document_id != null)
434 {
435 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
436 }
437 else
438 {
439 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
440 }
441 the_document.appendChild(doc_node);
442 has_dummy = true;
443 }
444
445 // Build a request to obtain some document metadata
446 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
447 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
448 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
449 dm_message.appendChild(dm_request);
450 // Create a parameter list to specify the required metadata information
451
452 HashSet<String> meta_names = new HashSet<String>();
453 meta_names.add("Title"); // the default
454 if (format_elem != null)
455 {
456 getRequiredMetadataNames(format_elem, meta_names);
457 }
458
459 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
460 if (extraMetaListElem != null)
461 {
462 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
463 for (int i = 0; i < extraMetaList.getLength(); i++)
464 {
465 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
466 }
467 }
468
469 Element dm_param_list = createMetadataParamList(doc,meta_names);
470 if (service_params != null)
471 {
472 GSXML.addParametersToList(dm_param_list, service_params);
473 }
474
475 dm_request.appendChild(dm_param_list);
476
477 // create the doc node list for the metadata request
478 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
479 dm_request.appendChild(dm_doc_list);
480
481 // Add each node from the structure response into the metadata request
482 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
483 for (int i = 0; i < doc_nodes.getLength(); i++)
484 {
485 Element doc_node = (Element) doc_nodes.item(i);
486 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
487
488 // Add the documentNode to the list
489 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
490 dm_doc_list.appendChild(dm_doc_node);
491 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
492 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
493 if (document_id == null){
494 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
495 }
496
497 }
498
499 // we also want a metadata request to the top level document to get
500 // assocfilepath - this could be cached too
501 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
502 dm_message.appendChild(doc_meta_request);
503 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
504 if (service_params != null)
505 {
506 GSXML.addParametersToList(doc_meta_param_list, service_params);
507 }
508
509 doc_meta_request.appendChild(doc_meta_param_list);
510 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
511 doc_meta_param_list.appendChild(doc_param);
512 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
513 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
514
515 // create the doc node list for the metadata request
516 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
517 doc_meta_request.appendChild(doc_list);
518
519 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
520 // the node we want is the root document node
521 if (document_id != null)
522 {
523 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
524 }
525 /*else
526 {
527 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
528 // can we assume that href is always a top level doc??
529 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
530 //doc_node.setAttribute("externalURL", has_rl);
531 }*/
532 doc_list.appendChild(doc_node);
533
534 Element dm_response_message = (Element) this.mr.process(dm_message);
535 if (processErrorElements(dm_response_message, page_response))
536 {
537 return result;
538 }
539
540 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
541 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
542
543 // Merge the metadata with the structure information
544 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
545 for (int i = 0; i < doc_nodes.getLength(); i++)
546 {
547 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
548 }
549 // get the top level doc metadata out
550 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
551 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
552 GSXML.mergeMetadataLists(the_document, top_doc_node);
553
554 // do we want doc text content? If not, we are done.
555 if (!get_text) {
556 // don't get text
557 return result;
558 }
559
560 // Build a request to obtain some document content
561 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
562 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
563 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
564 dc_message.appendChild(dc_request);
565
566 // Create a parameter list to specify the request parameters - empty for now
567 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
568 if (service_params != null)
569 {
570 GSXML.addParametersToList(dc_param_list, service_params);
571 }
572
573 dc_request.appendChild(dc_param_list);
574
575 // get the content
576 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
577 if (expand_document)
578 {
579 dc_request.appendChild(dm_doc_list);
580 }
581 else
582 {
583 dc_request.appendChild(basic_doc_list);
584 }
585 Element dc_response_message = (Element) this.mr.process(dc_message);
586 if (processErrorElements(dc_response_message, page_response))
587 {
588 return result;
589 }
590 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
591
592 if (expand_document)
593 {
594 // Merge the content with the structure information
595 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
596 for (int i = 0; i < doc_nodes.getLength(); i++)
597 {
598 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
599 if (content != null)
600 {
601 if (highlight_query_terms)
602 {
603 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
604 content = highlightQueryTerms(request, node_id, (Element) content);
605 }
606
607 doc_nodes.item(i).appendChild(doc.importNode(content, true));
608 }
609 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
610 }
611 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
612 Element dummy_node = (Element) doc_nodes.item(0);
613 the_document.removeChild(dummy_node);
614 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
615 NodeList dummy_children = dummy_node.getChildNodes();
616 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
617 {
618 // special case as we don't want more than one metadata list
619 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
620 {
621 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
622 }
623 else
624 {
625 the_document.appendChild(dummy_children.item(i));
626 }
627 }
628 }
629 }
630 else
631 {
632 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
633 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
634 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
635 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
636
637 if (dc_response_doc_content == null)
638 {
639 // no content to add
640 if (dc_response_doc.getAttribute("external").equals("true"))
641 {
642
643 //if (dc_response_doc_external != null)
644 //{
645 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
646
647 the_document.setAttribute("selectedNode", href_id);
648 the_document.setAttribute("external", href_id);
649 }
650 return result;
651 }
652 if (highlight_query_terms)
653 {
654 dc_response_doc.removeChild(dc_response_doc_content);
655
656 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
657 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
658 }
659
660 if (provide_annotations)
661 {
662 String service_selected = (String) params.get(ENRICH_DOC_ARG);
663 if (service_selected != null && service_selected.equals("1"))
664 {
665 // now we can modifiy the response doc if needed
666 String enrich_service = (String) params.get(GSParams.SERVICE);
667 // send a message to the service
668 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
669 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
670 enrich_message.appendChild(enrich_request);
671 // check for parameters
672 HashMap e_service_params = (HashMap) params.get("s1");
673 if (e_service_params != null)
674 {
675 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
676 GSXML.addParametersToList(enrich_pl, e_service_params);
677 enrich_request.appendChild(enrich_pl);
678 }
679 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
680 enrich_request.appendChild(e_doc_list);
681 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
682
683 Node enrich_response = this.mr.process(enrich_message);
684
685 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
686 path = GSPath.createPath(links);
687 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
688
689 }
690 } // if provide_annotations
691
692 // use the returned id rather than the sent one cos there may have
693 // been modifiers such as .pr that are removed.
694 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
695 the_document.setAttribute("selectedNode", modified_doc_id);
696 if (has_dummy)
697 {
698 // change the id if necessary and add the content
699 Element dummy_node = (Element) doc_nodes.item(0);
700
701 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
702 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
703 // hack for simple type
704 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
705 {
706 // we dont want the internal docNode, just want the content and metadata in the document
707 // rethink this!!
708 the_document.removeChild(dummy_node);
709
710 NodeList dummy_children = dummy_node.getChildNodes();
711 //for (int i=0; i<dummy_children.getLength(); i++) {
712 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
713 {
714 // special case as we don't want more than one metadata list
715 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
716 {
717 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
718 }
719 else
720 {
721 the_document.appendChild(dummy_children.item(i));
722 }
723 }
724 }
725
726 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
727 }
728 else
729 {
730 // Merge the document content with the metadata and structure information
731 for (int i = 0; i < doc_nodes.getLength(); i++)
732 {
733 Node dn = doc_nodes.item(i);
734 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
735 if (dn_id.equals(modified_doc_id))
736 {
737 dn.appendChild(doc.importNode(dc_response_doc_content, true));
738 break;
739 }
740 }
741 }
742 }
743 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
744 return result;
745 }
746
747 /**
748 * tell the param class what its arguments are if an action has its own
749 * arguments, this should add them to the params object - particularly
750 * important for args that should not be saved
751 */
752 public boolean addActionParameters(GSParams params)
753 {
754 params.addParameter(GOTO_PAGE_ARG, false);
755 params.addParameter(ENRICH_DOC_ARG, false);
756 params.addParameter(EXPAND_DOCUMENT_ARG, false);
757 params.addParameter(EXPAND_CONTENTS_ARG, false);
758 params.addParameter(REALISTIC_BOOK_ARG, false);
759
760 return true;
761 }
762
763 /**
764 * this method gets the collection description, the format info, the list of
765 * enrich services, etc - stuff that is needed for the page, but is the same
766 * whatever the query is - should be cached
767 */
768 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
769 {
770 Document doc = page_response.getOwnerDocument();
771
772 // create a message to process - contains requests for the collection
773 // description, the format element, the enrich services on offer
774 // these could all be cached
775 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
776 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
777 // the format request - ignore for now, where does this request go to??
778 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
779 info_message.appendChild(format_request);
780
781 // the enrich_services request - only do this if provide_annotations is true
782
783 if (provide_annotations)
784 {
785 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
786 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
787 info_message.appendChild(enrich_services_request);
788 }
789
790 Element info_response = (Element) this.mr.process(info_message);
791
792 // the collection is the first response
793 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
794 Element format_resp = (Element) responses.item(0);
795
796 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
797 if (format_elem != null)
798 {
799 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
800 if (global_format_elem != null)
801 {
802 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
803 }
804
805 // set the format type
806 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
807 page_response.appendChild(doc.importNode(format_elem, true));
808 }
809
810 if (provide_annotations)
811 {
812 Element services_resp = (Element) responses.item(1);
813
814 // a new message for the mr
815 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
816 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
817 boolean service_found = false;
818 for (int j = 0; j < e_services.getLength(); j++)
819 {
820 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
821 {
822 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
823 enrich_message.appendChild(s);
824 service_found = true;
825 }
826 }
827 if (service_found)
828 {
829 Element enrich_response = (Element) this.mr.process(enrich_message);
830
831 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
832 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
833 for (int i = 0; i < e_responses.getLength(); i++)
834 {
835 Element e_resp = (Element) e_responses.item(i);
836 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
837 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
838 service_list.appendChild(e_service);
839 }
840 page_response.appendChild(service_list);
841 }
842 } // if provide_annotations
843 return true;
844
845 }
846
847 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
848 {
849 Document doc = basic_doc_list.getOwnerDocument();
850
851 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
852 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
853 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
854 ds_message.appendChild(ds_request);
855
856 // Create a parameter list to specify the required structure information
857 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
858 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
859 ds_param_list.appendChild(ds_param);
860 ds_param.setAttribute(GSXML.NAME_ATT, "info");
861 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
862
863 ds_request.appendChild(ds_param_list);
864
865 // add the node list we created earlier
866 ds_request.appendChild(basic_doc_list);
867
868 // Process the document structure retrieve message
869 Element ds_response_message = (Element) this.mr.process(ds_message);
870 if (processErrorElements(ds_response_message, page_response))
871 {
872 return null;
873 }
874
875 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
876 String path = GSPath.createPath(links);
877 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
878 if (info_elem == null) {
879 return null;
880 }
881 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
882 if (doctype_elem != null)
883 {
884 String doc_type = doctype_elem.getAttribute("value");
885 return doc_type;
886 }
887 return null;
888 }
889
890 /** run the XSLT transform which converts from doc.xml format to our internal document format */
891 protected Element transformArchiveToDocument(Element section) {
892
893 String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
894 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
895 if (stylesheet_doc == null) {
896 logger.error("Couldn't load in stylesheet "+stylesheet_file);
897 return section;
898 }
899
900 Document section_doc = XMLConverter.newDOM();
901 section_doc.appendChild(section_doc.importNode(section, true));
902 Node result = this.transformer.transform(stylesheet_doc, section_doc);
903 logger.error("transform result = "+XMLConverter.getPrettyString(result));
904
905 Element new_element;
906 if (result.getNodeType() == Node.DOCUMENT_NODE)
907 {
908 new_element = ((Document) result).getDocumentElement();
909 }
910 else
911 {
912 new_element = (Element) result;
913 }
914
915
916 return new_element;
917
918 }
919
920
921 /**
922 * this involves a bit of a hack to get the equivalent query terms - has to
923 * requery the query service - uses the last selected service name. (if it
924 * ends in query). should this action do the query or should it send a
925 * message to the query action? but that will involve lots of extra stuff.
926 * also doesn't handle phrases properly - just highlights all the terms
927 * found in the text.
928 */
929 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
930 {
931 Document doc = request.getOwnerDocument();
932
933 // do the query again to get term info
934 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
935 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
936
937 HashMap previous_params = (HashMap) params.get("p");
938 if (previous_params == null)
939 {
940 return dc_response_doc_content;
941 }
942 String service_name = (String) previous_params.get(GSParams.SERVICE);
943 if (service_name == null || !service_name.endsWith("Query"))
944 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
945 logger.debug("invalid service, not doing highlighting");
946 return dc_response_doc_content;
947 }
948 String collection = (String) params.get(GSParams.COLLECTION);
949 UserContext userContext = new UserContext(request);
950 String to = GSPath.appendLink(collection, service_name);
951
952 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
953 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
954 mr_query_message.appendChild(mr_query_request);
955
956 // paramList
957 HashMap service_params = (HashMap) params.get("s1");
958
959 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
960 GSXML.addParametersToList(query_param_list, service_params);
961 if (current_node_id != null) {
962 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
963 } else {
964 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
965 }
966 mr_query_request.appendChild(query_param_list);
967 // do the query
968 Element mr_query_response = (Element) this.mr.process(mr_query_message);
969 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
970 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
971 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
972 if (highlighted_Node != null)
973 {
974 // Build a request to process highlighted text
975
976 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
977 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
978 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
979 hl_message.appendChild(dc_request);
980
981 // Create a parameter list to specify the request parameters - empty for now
982 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
983 dc_request.appendChild(dc_param_list);
984
985 // get the content
986 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
987 dc_request.appendChild(doc_list);
988 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
989 doc_list.appendChild(current_doc);
990 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
991 //Append highlighted content to request for processing
992 dc_request.appendChild(doc.importNode(highlighted_Node, true));
993 Element hl_response_message = (Element) this.mr.process(hl_message);
994
995 //Get results
996 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
997 Element content = (Element) contentList.item(0);
998 return content;
999 }
1000 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1001 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1002 if (query_term_list_element == null)
1003 {
1004 // no term info
1005 logger.error("No query term information.\n");
1006 return dc_response_doc_content;
1007 }
1008
1009 String content = GSXML.getNodeText(dc_response_doc_content);
1010
1011 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1012 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1013
1014 HashSet<String> query_term_variants = new HashSet<String>();
1015 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1016 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1017 {
1018 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1019 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1020 {
1021 for (int i = 0; i < terms_nodelist.getLength(); i++)
1022 {
1023 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1024 String termValueU = null;
1025 String termValueL = null;
1026
1027 if (termValue.length() > 1)
1028 {
1029 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1030 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1031 }
1032 else
1033 {
1034 termValueU = termValue.substring(0, 1).toUpperCase();
1035 termValueL = termValue.substring(0, 1).toLowerCase();
1036 }
1037
1038 query_term_variants.add(termValueU);
1039 query_term_variants.add(termValueL);
1040 }
1041 }
1042 }
1043 else
1044 {
1045 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1046 {
1047 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1048 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1049 for (int j = 0; j < equivalent_terms.length; j++)
1050 {
1051 query_term_variants.add(equivalent_terms[j]);
1052 }
1053 }
1054 }
1055
1056 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1057
1058 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1059 String performed_query = GSXML.getNodeText(query_element) + " ";
1060
1061 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1062 int term_start = 0;
1063 boolean in_term = false;
1064 boolean in_phrase = false;
1065 for (int i = 0; i < performed_query.length(); i++)
1066 {
1067 char character = performed_query.charAt(i);
1068 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1069
1070 // Has a query term just started?
1071 if (in_term == false && is_character_letter_or_digit == true)
1072 {
1073 in_term = true;
1074 term_start = i;
1075 }
1076
1077 // Or has a term just finished?
1078 else if (in_term == true && is_character_letter_or_digit == false)
1079 {
1080 in_term = false;
1081 String term = performed_query.substring(term_start, i);
1082
1083 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1084 if (term_element != null)
1085 {
1086
1087 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1088
1089 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1090 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1091 {
1092 String termValueU = null;
1093 String termValueL = null;
1094
1095 if (term.length() > 1)
1096 {
1097 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1098 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1099 }
1100 else
1101 {
1102 termValueU = term.substring(0, 1).toUpperCase();
1103 termValueL = term.substring(0, 1).toLowerCase();
1104 }
1105
1106 phrase_query_p_term_x_variants.add(termValueU);
1107 phrase_query_p_term_x_variants.add(termValueL);
1108 }
1109 else
1110 {
1111 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1112 {
1113 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1114 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1115 for (int k = 0; k < term_equivalent_terms.length; k++)
1116 {
1117 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1118 }
1119 }
1120 }
1121 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1122
1123 if (in_phrase == false)
1124 {
1125 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1126 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1127 }
1128 }
1129 }
1130 // Watch for phrases (surrounded by quotes)
1131 if (character == '\"')
1132 {
1133 // Has a phrase just started?
1134 if (in_phrase == false)
1135 {
1136 in_phrase = true;
1137 }
1138 // Or has a phrase just finished?
1139 else if (in_phrase == true)
1140 {
1141 in_phrase = false;
1142 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1143 }
1144
1145 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1146 }
1147 }
1148
1149 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1150 }
1151
1152 /**
1153 * Highlights query terms in a piece of text.
1154 */
1155 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1156 {
1157 // Convert the content string to an array of characters for speed
1158 char[] content_characters = new char[content.length()];
1159 content.getChars(0, content.length(), content_characters, 0);
1160
1161 // Now skim through the content, identifying word matches
1162 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1163 int word_start = 0;
1164 boolean in_word = false;
1165 boolean preceding_word_matched = false;
1166 boolean inTag = false;
1167 for (int i = 0; i < content_characters.length; i++)
1168 {
1169 //We don't want to find words inside HTML tags
1170 if (content_characters[i] == '<')
1171 {
1172 inTag = true;
1173 continue;
1174 }
1175 else if (inTag && content_characters[i] == '>')
1176 {
1177 inTag = false;
1178 }
1179 else if (inTag)
1180 {
1181 continue;
1182 }
1183
1184 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1185
1186 // Has a word just started?
1187 if (in_word == false && is_character_letter_or_digit == true)
1188 {
1189 in_word = true;
1190 word_start = i;
1191 }
1192
1193 // Or has a word just finished?
1194 else if (in_word == true && is_character_letter_or_digit == false)
1195 {
1196 in_word = false;
1197
1198 // Check if the word matches any of the query term equivalents
1199 String word = new String(content_characters, word_start, (i - word_start));
1200 if (query_term_variants.contains(word))
1201 {
1202 // We have found a matching word, so remember its location
1203 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1204 preceding_word_matched = true;
1205 }
1206 else
1207 {
1208 preceding_word_matched = false;
1209 }
1210 }
1211 }
1212
1213 // Don't forget the last word...
1214 if (in_word == true)
1215 {
1216 // Check if the word matches any of the query term equivalents
1217 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1218 if (query_term_variants.contains(word))
1219 {
1220 // We have found a matching word, so remember its location
1221 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1222 }
1223 }
1224
1225 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1226 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1227
1228 // Deal with phrases now
1229 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1230 for (int i = 0; i < word_matches.size(); i++)
1231 {
1232 WordMatch word_match = word_matches.get(i);
1233
1234 // See if any partial phrase matches are extended by this word
1235 if (word_match.preceding_word_matched)
1236 {
1237 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1238 {
1239 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1240 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1241 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1242 if (phrase_query_p_term_x_variants.contains(word_match.word))
1243 {
1244 partial_phrase_match.num_words_matched++;
1245
1246 // Has a complete phrase match occurred?
1247 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1248 {
1249 // Check for overlaps by looking at the previous highlight range
1250 if (!highlight_end_positions.isEmpty())
1251 {
1252 int last_highlight_index = highlight_end_positions.size() - 1;
1253 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1254 if (last_highlight_end > partial_phrase_match.start_position)
1255 {
1256 // There is an overlap, so remove the previous phrase match
1257 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1258 highlight_end_positions.remove(last_highlight_index);
1259 partial_phrase_match.start_position = last_highlight_start;
1260 }
1261 }
1262
1263 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1264 highlight_end_positions.add(new Integer(word_match.end_position));
1265 }
1266 // No, but add the partial match back into the list for next time
1267 else
1268 {
1269 partial_phrase_matches.add(partial_phrase_match);
1270 }
1271 }
1272 }
1273 }
1274 else
1275 {
1276 partial_phrase_matches.clear();
1277 }
1278
1279 // See if this word is at the start of any of the phrases
1280 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1281 {
1282 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1283 if (phrase_query_p_term_variants_list.size()>0) {
1284 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1285 if (phrase_query_p_term_1_variants.contains(word_match.word))
1286 {
1287 // If this phrase is just one word long, we have a complete match
1288 if (phrase_query_p_term_variants_list.size() == 1)
1289 {
1290 highlight_start_positions.add(new Integer(word_match.start_position));
1291 highlight_end_positions.add(new Integer(word_match.end_position));
1292 }
1293 // Otherwise we have the start of a potential phrase match
1294 else
1295 {
1296 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1297 }
1298 }
1299 }
1300 }
1301 }
1302
1303 // Now add the annotation tags into the document at the correct points
1304 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1305
1306 int last_wrote = 0;
1307 for (int i = 0; i < highlight_start_positions.size(); i++)
1308 {
1309 int highlight_start = highlight_start_positions.get(i).intValue();
1310 int highlight_end = highlight_end_positions.get(i).intValue();
1311
1312 // Print anything before the highlight range
1313 if (last_wrote < highlight_start)
1314 {
1315 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1316 content_element.appendChild(doc.createTextNode(preceding_text));
1317 }
1318
1319 // Print the highlight text, annotated
1320 if (highlight_end > last_wrote)
1321 {
1322 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1323 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1324 annotation_element.setAttribute("type", "query_term");
1325 content_element.appendChild(annotation_element);
1326 last_wrote = highlight_end;
1327 }
1328 }
1329
1330 // Finish off any unwritten text
1331 if (last_wrote < content_characters.length)
1332 {
1333 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1334 content_element.appendChild(doc.createTextNode(remaining_text));
1335 }
1336 return content_element;
1337 }
1338
1339 static private class WordMatch
1340 {
1341 public String word;
1342 public int start_position;
1343 public int end_position;
1344 public boolean preceding_word_matched;
1345
1346 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1347 {
1348 this.word = word;
1349 this.start_position = start_position;
1350 this.end_position = end_position;
1351 this.preceding_word_matched = preceding_word_matched;
1352 }
1353 }
1354
1355 static private class PartialPhraseMatch
1356 {
1357 public int start_position;
1358 public int query_phrase_number;
1359 public int num_words_matched;
1360
1361 public PartialPhraseMatch(int start_position, int query_phrase_number)
1362 {
1363 this.start_position = start_position;
1364 this.query_phrase_number = query_phrase_number;
1365 this.num_words_matched = 1;
1366 }
1367 }
1368}
Note: See TracBrowser for help on using the repository browser.