source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32068

Last change on this file since 32068 was 32068, checked in by kjdon, 6 years ago

If docEdit=1, then we just retrieve the entire document from archives, then convert it to the internal format using XSLT. Also, don't get the document text if noText=1.

  • Property svn:keywords set to Author Date Id Revision
File size: 49.7 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37import java.io.Serializable;
38
39import org.apache.log4j.*;
40
41/** Action class for retrieving Documents via the message router */
42public class DocumentAction extends Action
43{
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
46
47 // this is used to specify that the sibling nodes of a selected one should be obtained
48 public static final String SIBLING_ARG = "sib";
49 public static final String GOTO_PAGE_ARG = "gp";
50 public static final String ENRICH_DOC_ARG = "end";
51 public static final String EXPAND_DOCUMENT_ARG = "ed";
52 public static final String EXPAND_CONTENTS_ARG = "ec";
53 public static final String REALISTIC_BOOK_ARG = "book";
54 public static final String NO_TEXT_ARG = "noText";
55 public static final String DOC_EDIT_ARG = "docEdit";
56
57 /**
58 * if this is set to true, when a document is displayed, any annotation type
59 * services (enrich) will be offered to the user as well
60 */
61 protected boolean provide_annotations = false;
62
63 protected boolean highlight_query_terms = false;
64
65 public boolean configure()
66 {
67 super.configure();
68 String highlight = (String) config_params.get("highlightQueryTerms");
69 if (highlight != null && highlight.equals("true"))
70 {
71 highlight_query_terms = true;
72 }
73 String annotate = (String) config_params.get("displayAnnotationService");
74 if (annotate != null && annotate.equals("true"))
75 {
76 provide_annotations = true;
77 }
78 return true;
79 }
80
81 public Node process(Node message_node)
82 {
83 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
84
85 Element message = GSXML.nodeToElement(message_node);
86 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
87
88 // the response
89 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
90 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
91 result.appendChild(page_response);
92
93 // get the request - assume only one
94 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
95 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
96 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
97
98 // just in case there are some that need to get passed to the services
99 HashMap service_params = (HashMap) params.get("s0");
100
101 String collection = (String) params.get(GSParams.COLLECTION);
102 String document_id = (String) params.get(GSParams.DOCUMENT);
103 if (document_id != null && document_id.equals(""))
104 {
105 document_id = null;
106 }
107 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
108 if (href != null && href.equals(""))
109 {
110 href = null;
111 }
112 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
113 if (document_id == null && href == null)
114 {
115 logger.error("no document specified!");
116 return result;
117 }
118 if (rl != null && rl.equals("0"))
119 {
120 // this is a true external link, we should have been directed to a different page or action
121 logger.error("rl value was 0, shouldn't get here");
122 return result;
123 }
124
125 UserContext userContext = new UserContext(request);
126
127 //append site metadata
128 addSiteMetadata(page_response, userContext);
129 addInterfaceOptions(page_response);
130
131 // get the additional data needed for the page
132 getBackgroundData(page_response, collection, userContext);
133 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
134
135 if (format_elem != null) {
136 // lets look for param defaults set in config file
137 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
138 for (int i=0; i<param_defaults.getLength(); i++) {
139 Element p = (Element)param_defaults.item(i);
140 String name = p.getAttribute(GSXML.NAME_ATT);
141 if (params.get(name) ==null) {
142 // wasn't set from interface
143 String value = p.getAttribute(GSXML.VALUE_ATT);
144 params.put(name, value );
145 // also add into request param xml so that xslt knows it too
146 GSXML.addParameterToList(cgi_paramList, name, value);
147 }
148 }
149 }
150
151
152 boolean editing_document = false;
153 String doc_edit = (String) params.get(DOC_EDIT_ARG);
154 if (doc_edit != null && doc_edit.equals("1")) {
155 editing_document = true;
156 }
157
158 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
159 if (editing_document) {
160
161 // call get archive doc
162 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
163 String to = "DocXMLGetSection";
164 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
165 dx_message.appendChild(dx_request);
166 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
167 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
168 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
169 dx_request.appendChild(dx_section);
170
171 Element dx_response_message = (Element) this.mr.process(dx_message);
172 if (processErrorElements(dx_response_message, page_response))
173 {
174 return result;
175 }
176
177 // get the section out
178 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
179 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
180 if (section == null) {
181 logger.error("no archive doc returned for "+document_id);
182 return result;
183 }
184 // convert the archive format into the internal format that the page response requires
185
186 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
187 page_response.appendChild(doc_elem);
188 section.setAttribute(GSXML.NODE_ID_ATT, document_id);
189
190 Element transformed_section = transformArchiveToDocument(section);
191 doc_elem.appendChild(doc.importNode(transformed_section, true));
192 logger.error("dx result = "+XMLConverter.getPrettyString(result));
193 return result;
194 }
195
196 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
197 if (document_type != null && document_type.equals(""))
198 {
199 //document_type = "hierarchy";
200 document_type = null; // we'll get it later if not already specified
201 }
202 //whether to retrieve siblings or not
203 boolean get_siblings = false;
204 String sibs = (String) params.get(SIBLING_ARG);
205 if (sibs != null && sibs.equals("1"))
206 {
207 get_siblings = true;
208 }
209
210 String doc_id_modifier = "";
211 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
212 if (sibling_num != null && !sibling_num.equals(""))
213 {
214 // we have to modify the doc name
215 doc_id_modifier = "." + sibling_num + ".ss";
216 }
217
218 boolean expand_document = false;
219 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
220 if (ed_arg != null && ed_arg.equals("1"))
221 {
222 expand_document = true;
223 }
224
225 boolean expand_contents = false;
226 if (expand_document)
227 { // we always expand the contents with the text
228 expand_contents = true;
229 }
230 else
231 {
232 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
233 if (ec_arg != null && ec_arg.equals("1"))
234 {
235 expand_contents = true;
236 }
237 }
238
239 // do we want text content? Not if no_text=1.
240 // expand_document overrides this. - should it??
241 boolean get_text = true;
242 String nt_arg = (String) params.get(NO_TEXT_ARG);
243
244 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
245 logger.error("SETTING GET TEXT TO FALSE");
246 get_text = false;
247 } else {
248 logger.error("GET TEXT REMAINS TRUE");
249 }
250
251 // the_document is where all the doc info - structure and metadata etc
252 // is added into, to be returned in the page
253 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
254 page_response.appendChild(the_document);
255
256 // create a basic doc list containing the current node
257 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
258 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
259 basic_doc_list.appendChild(current_doc);
260 if (document_id != null)
261 {
262 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
263 }
264 else
265 {
266 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
267 // do we need this??
268 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
269 }
270
271 if (document_type == null)
272 {
273 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
274 }
275 if (document_type == null)
276 {
277 logger.debug("doctype is null, setting to simple");
278 document_type = GSXML.DOC_TYPE_SIMPLE;
279 }
280
281 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
282
283
284 // Create a parameter list to specify the required structure information
285 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
286
287 if (service_params != null)
288 {
289 GSXML.addParametersToList(ds_param_list, service_params);
290 }
291
292 Element ds_param = null;
293 boolean get_structure = false;
294 boolean get_structure_info = false;
295 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
296 {
297 get_structure_info = true;
298
299 if (expand_contents)
300 {
301 ds_param = doc.createElement(GSXML.PARAM_ELEM);
302 ds_param_list.appendChild(ds_param);
303 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
304 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
305 }
306
307 // get the info needed for paged naviagtion
308 ds_param = doc.createElement(GSXML.PARAM_ELEM);
309 ds_param_list.appendChild(ds_param);
310 ds_param.setAttribute(GSXML.NAME_ATT, "info");
311 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
312 ds_param = doc.createElement(GSXML.PARAM_ELEM);
313 ds_param_list.appendChild(ds_param);
314 ds_param.setAttribute(GSXML.NAME_ATT, "info");
315 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
316 ds_param = doc.createElement(GSXML.PARAM_ELEM);
317 ds_param_list.appendChild(ds_param);
318 ds_param.setAttribute(GSXML.NAME_ATT, "info");
319 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
320
321 if (get_siblings)
322 {
323 ds_param = doc.createElement(GSXML.PARAM_ELEM);
324 ds_param_list.appendChild(ds_param);
325 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
326 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
327 }
328
329 }
330 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
331 {
332 get_structure = true;
333 if (expand_contents)
334 {
335 ds_param = doc.createElement(GSXML.PARAM_ELEM);
336 ds_param_list.appendChild(ds_param);
337 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
338 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
339 }
340 else
341 {
342 // get the info needed for table of contents
343 ds_param = doc.createElement(GSXML.PARAM_ELEM);
344 ds_param_list.appendChild(ds_param);
345 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
346 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
347 ds_param = doc.createElement(GSXML.PARAM_ELEM);
348 ds_param_list.appendChild(ds_param);
349 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
350 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
351 if (get_siblings)
352 {
353 ds_param = doc.createElement(GSXML.PARAM_ELEM);
354 ds_param_list.appendChild(ds_param);
355 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
356 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
357 }
358 }
359 }
360 else
361 {
362 // we dont need any structure
363 }
364
365 boolean has_dummy = false;
366 if (get_structure || get_structure_info)
367 {
368
369 // Build a request to obtain the document structure
370 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
371 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
372 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
373 ds_message.appendChild(ds_request);
374 ds_request.appendChild(ds_param_list);
375
376 // add the node list we created earlier
377 ds_request.appendChild(basic_doc_list);
378
379 // Process the document structure retrieve message
380 Element ds_response_message = (Element) this.mr.process(ds_message);
381 if (processErrorElements(ds_response_message, page_response))
382 {
383 return result;
384 }
385
386 // get the info and print out
387 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
388 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
389 path = GSPath.appendLink(path, "nodeStructureInfo");
390 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
391 // get the doc_node bit
392 if (ds_response_struct_info != null)
393 {
394 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
395 }
396 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
397 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
398 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
399 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
400
401 if (ds_response_structure != null)
402 {
403 // add the contents of the structure bit into the_document
404 NodeList structs = ds_response_structure.getChildNodes();
405 for (int i = 0; i < structs.getLength(); i++)
406 {
407 the_document.appendChild(doc.importNode(structs.item(i), true));
408 }
409 }
410 else
411 {
412 // no structure nodes, so put in a dummy doc node
413 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
414 if (document_id != null)
415 {
416 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
417 }
418 else
419 {
420 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
421
422 }
423 the_document.appendChild(doc_node);
424 has_dummy = true;
425 }
426 }
427 else
428 { // a simple type - we dont have a dummy node for simple
429 // should think about this more
430 // no structure request, so just put in a dummy doc node
431 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
432 if (document_id != null)
433 {
434 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
435 }
436 else
437 {
438 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
439 }
440 the_document.appendChild(doc_node);
441 has_dummy = true;
442 }
443
444 // Build a request to obtain some document metadata
445 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
446 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
447 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
448 dm_message.appendChild(dm_request);
449 // Create a parameter list to specify the required metadata information
450
451 HashSet<String> meta_names = new HashSet<String>();
452 meta_names.add("Title"); // the default
453 if (format_elem != null)
454 {
455 getRequiredMetadataNames(format_elem, meta_names);
456 }
457
458 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
459 if (extraMetaListElem != null)
460 {
461 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
462 for (int i = 0; i < extraMetaList.getLength(); i++)
463 {
464 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
465 }
466 }
467
468 Element dm_param_list = createMetadataParamList(doc,meta_names);
469 if (service_params != null)
470 {
471 GSXML.addParametersToList(dm_param_list, service_params);
472 }
473
474 dm_request.appendChild(dm_param_list);
475
476 // create the doc node list for the metadata request
477 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
478 dm_request.appendChild(dm_doc_list);
479
480 // Add each node from the structure response into the metadata request
481 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
482 for (int i = 0; i < doc_nodes.getLength(); i++)
483 {
484 Element doc_node = (Element) doc_nodes.item(i);
485 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
486
487 // Add the documentNode to the list
488 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
489 dm_doc_list.appendChild(dm_doc_node);
490 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
491 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
492 if (document_id == null){
493 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
494 }
495
496 }
497
498 // we also want a metadata request to the top level document to get
499 // assocfilepath - this could be cached too
500 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
501 dm_message.appendChild(doc_meta_request);
502 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
503 if (service_params != null)
504 {
505 GSXML.addParametersToList(doc_meta_param_list, service_params);
506 }
507
508 doc_meta_request.appendChild(doc_meta_param_list);
509 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
510 doc_meta_param_list.appendChild(doc_param);
511 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
512 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
513
514 // create the doc node list for the metadata request
515 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
516 doc_meta_request.appendChild(doc_list);
517
518 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
519 // the node we want is the root document node
520 if (document_id != null)
521 {
522 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
523 }
524 /*else
525 {
526 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
527 // can we assume that href is always a top level doc??
528 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
529 //doc_node.setAttribute("externalURL", has_rl);
530 }*/
531 doc_list.appendChild(doc_node);
532
533 Element dm_response_message = (Element) this.mr.process(dm_message);
534 if (processErrorElements(dm_response_message, page_response))
535 {
536 return result;
537 }
538
539 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
540 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
541
542 // Merge the metadata with the structure information
543 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
544 for (int i = 0; i < doc_nodes.getLength(); i++)
545 {
546 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
547 }
548 // get the top level doc metadata out
549 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
550 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
551 GSXML.mergeMetadataLists(the_document, top_doc_node);
552
553 // do we want doc text content? If not, we are done.
554 if (!get_text) {
555 // don't get text
556 return result;
557 }
558
559 // Build a request to obtain some document content
560 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
561 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
562 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
563 dc_message.appendChild(dc_request);
564
565 // Create a parameter list to specify the request parameters - empty for now
566 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
567 if (service_params != null)
568 {
569 GSXML.addParametersToList(dc_param_list, service_params);
570 }
571
572 dc_request.appendChild(dc_param_list);
573
574 // get the content
575 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
576 if (expand_document)
577 {
578 dc_request.appendChild(dm_doc_list);
579 }
580 else
581 {
582 dc_request.appendChild(basic_doc_list);
583 }
584 Element dc_response_message = (Element) this.mr.process(dc_message);
585 if (processErrorElements(dc_response_message, page_response))
586 {
587 return result;
588 }
589 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
590
591 if (expand_document)
592 {
593 // Merge the content with the structure information
594 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
595 for (int i = 0; i < doc_nodes.getLength(); i++)
596 {
597 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
598 if (content != null)
599 {
600 if (highlight_query_terms)
601 {
602 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
603 content = highlightQueryTerms(request, node_id, (Element) content);
604 }
605
606 doc_nodes.item(i).appendChild(doc.importNode(content, true));
607 }
608 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
609 }
610 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
611 Element dummy_node = (Element) doc_nodes.item(0);
612 the_document.removeChild(dummy_node);
613 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
614 NodeList dummy_children = dummy_node.getChildNodes();
615 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
616 {
617 // special case as we don't want more than one metadata list
618 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
619 {
620 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
621 }
622 else
623 {
624 the_document.appendChild(dummy_children.item(i));
625 }
626 }
627 }
628 }
629 else
630 {
631 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
632 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
633 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
634 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
635
636 if (dc_response_doc_content == null)
637 {
638 // no content to add
639 if (dc_response_doc.getAttribute("external").equals("true"))
640 {
641
642 //if (dc_response_doc_external != null)
643 //{
644 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
645
646 the_document.setAttribute("selectedNode", href_id);
647 the_document.setAttribute("external", href_id);
648 }
649 return result;
650 }
651 if (highlight_query_terms)
652 {
653 dc_response_doc.removeChild(dc_response_doc_content);
654
655 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
656 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
657 }
658
659 if (provide_annotations)
660 {
661 String service_selected = (String) params.get(ENRICH_DOC_ARG);
662 if (service_selected != null && service_selected.equals("1"))
663 {
664 // now we can modifiy the response doc if needed
665 String enrich_service = (String) params.get(GSParams.SERVICE);
666 // send a message to the service
667 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
668 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
669 enrich_message.appendChild(enrich_request);
670 // check for parameters
671 HashMap e_service_params = (HashMap) params.get("s1");
672 if (e_service_params != null)
673 {
674 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
675 GSXML.addParametersToList(enrich_pl, e_service_params);
676 enrich_request.appendChild(enrich_pl);
677 }
678 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
679 enrich_request.appendChild(e_doc_list);
680 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
681
682 Node enrich_response = this.mr.process(enrich_message);
683
684 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
685 path = GSPath.createPath(links);
686 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
687
688 }
689 } // if provide_annotations
690
691 // use the returned id rather than the sent one cos there may have
692 // been modifiers such as .pr that are removed.
693 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
694 the_document.setAttribute("selectedNode", modified_doc_id);
695 if (has_dummy)
696 {
697 // change the id if necessary and add the content
698 Element dummy_node = (Element) doc_nodes.item(0);
699
700 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
701 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
702 // hack for simple type
703 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
704 {
705 // we dont want the internal docNode, just want the content and metadata in the document
706 // rethink this!!
707 the_document.removeChild(dummy_node);
708
709 NodeList dummy_children = dummy_node.getChildNodes();
710 //for (int i=0; i<dummy_children.getLength(); i++) {
711 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
712 {
713 // special case as we don't want more than one metadata list
714 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
715 {
716 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
717 }
718 else
719 {
720 the_document.appendChild(dummy_children.item(i));
721 }
722 }
723 }
724
725 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
726 }
727 else
728 {
729 // Merge the document content with the metadata and structure information
730 for (int i = 0; i < doc_nodes.getLength(); i++)
731 {
732 Node dn = doc_nodes.item(i);
733 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
734 if (dn_id.equals(modified_doc_id))
735 {
736 dn.appendChild(doc.importNode(dc_response_doc_content, true));
737 break;
738 }
739 }
740 }
741 }
742 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
743 return result;
744 }
745
746 /**
747 * tell the param class what its arguments are if an action has its own
748 * arguments, this should add them to the params object - particularly
749 * important for args that should not be saved
750 */
751 public boolean addActionParameters(GSParams params)
752 {
753 params.addParameter(GOTO_PAGE_ARG, false);
754 params.addParameter(ENRICH_DOC_ARG, false);
755 params.addParameter(EXPAND_DOCUMENT_ARG, false);
756 params.addParameter(EXPAND_CONTENTS_ARG, false);
757 params.addParameter(REALISTIC_BOOK_ARG, false);
758
759 return true;
760 }
761
762 /**
763 * this method gets the collection description, the format info, the list of
764 * enrich services, etc - stuff that is needed for the page, but is the same
765 * whatever the query is - should be cached
766 */
767 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
768 {
769 Document doc = page_response.getOwnerDocument();
770
771 // create a message to process - contains requests for the collection
772 // description, the format element, the enrich services on offer
773 // these could all be cached
774 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
775 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
776 // the format request - ignore for now, where does this request go to??
777 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
778 info_message.appendChild(format_request);
779
780 // the enrich_services request - only do this if provide_annotations is true
781
782 if (provide_annotations)
783 {
784 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
785 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
786 info_message.appendChild(enrich_services_request);
787 }
788
789 Element info_response = (Element) this.mr.process(info_message);
790
791 // the collection is the first response
792 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
793 Element format_resp = (Element) responses.item(0);
794
795 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
796 if (format_elem != null)
797 {
798 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
799 if (global_format_elem != null)
800 {
801 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
802 }
803
804 // set the format type
805 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
806 page_response.appendChild(doc.importNode(format_elem, true));
807 }
808
809 if (provide_annotations)
810 {
811 Element services_resp = (Element) responses.item(1);
812
813 // a new message for the mr
814 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
815 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
816 boolean service_found = false;
817 for (int j = 0; j < e_services.getLength(); j++)
818 {
819 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
820 {
821 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
822 enrich_message.appendChild(s);
823 service_found = true;
824 }
825 }
826 if (service_found)
827 {
828 Element enrich_response = (Element) this.mr.process(enrich_message);
829
830 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
831 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
832 for (int i = 0; i < e_responses.getLength(); i++)
833 {
834 Element e_resp = (Element) e_responses.item(i);
835 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
836 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
837 service_list.appendChild(e_service);
838 }
839 page_response.appendChild(service_list);
840 }
841 } // if provide_annotations
842 return true;
843
844 }
845
846 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
847 {
848 Document doc = basic_doc_list.getOwnerDocument();
849
850 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
851 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
852 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
853 ds_message.appendChild(ds_request);
854
855 // Create a parameter list to specify the required structure information
856 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
857 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
858 ds_param_list.appendChild(ds_param);
859 ds_param.setAttribute(GSXML.NAME_ATT, "info");
860 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
861
862 ds_request.appendChild(ds_param_list);
863
864 // add the node list we created earlier
865 ds_request.appendChild(basic_doc_list);
866
867 // Process the document structure retrieve message
868 Element ds_response_message = (Element) this.mr.process(ds_message);
869 if (processErrorElements(ds_response_message, page_response))
870 {
871 return null;
872 }
873
874 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
875 String path = GSPath.createPath(links);
876 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
877 if (info_elem == null) {
878 return null;
879 }
880 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
881 if (doctype_elem != null)
882 {
883 String doc_type = doctype_elem.getAttribute("value");
884 return doc_type;
885 }
886 return null;
887 }
888
889 /** run the XSLT transform which converts from doc.xml format to our internal document format */
890 protected Element transformArchiveToDocument(Element section) {
891
892 String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
893 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
894 if (stylesheet_doc == null) {
895 logger.error("Couldn't load in stylesheet "+stylesheet_file);
896 return section;
897 }
898
899 Document section_doc = XMLConverter.newDOM();
900 section_doc.appendChild(section_doc.importNode(section, true));
901 Node result = this.transformer.transform(stylesheet_doc, section_doc);
902 logger.error("transform result = "+XMLConverter.getPrettyString(result));
903
904 Element new_element;
905 if (result.getNodeType() == Node.DOCUMENT_NODE)
906 {
907 new_element = ((Document) result).getDocumentElement();
908 }
909 else
910 {
911 new_element = (Element) result;
912 }
913
914
915 return new_element;
916
917 }
918
919
920 /**
921 * this involves a bit of a hack to get the equivalent query terms - has to
922 * requery the query service - uses the last selected service name. (if it
923 * ends in query). should this action do the query or should it send a
924 * message to the query action? but that will involve lots of extra stuff.
925 * also doesn't handle phrases properly - just highlights all the terms
926 * found in the text.
927 */
928 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
929 {
930 Document doc = request.getOwnerDocument();
931
932 // do the query again to get term info
933 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
934 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
935
936 HashMap previous_params = (HashMap) params.get("p");
937 if (previous_params == null)
938 {
939 return dc_response_doc_content;
940 }
941 String service_name = (String) previous_params.get(GSParams.SERVICE);
942 if (service_name == null || !service_name.endsWith("Query"))
943 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
944 logger.debug("invalid service, not doing highlighting");
945 return dc_response_doc_content;
946 }
947 String collection = (String) params.get(GSParams.COLLECTION);
948 UserContext userContext = new UserContext(request);
949 String to = GSPath.appendLink(collection, service_name);
950
951 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
952 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
953 mr_query_message.appendChild(mr_query_request);
954
955 // paramList
956 HashMap service_params = (HashMap) params.get("s1");
957
958 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
959 GSXML.addParametersToList(query_param_list, service_params);
960 if (current_node_id != null) {
961 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
962 } else {
963 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
964 }
965 mr_query_request.appendChild(query_param_list);
966 // do the query
967 Element mr_query_response = (Element) this.mr.process(mr_query_message);
968 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
969 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
970 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
971 if (highlighted_Node != null)
972 {
973 // Build a request to process highlighted text
974
975 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
976 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
977 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
978 hl_message.appendChild(dc_request);
979
980 // Create a parameter list to specify the request parameters - empty for now
981 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
982 dc_request.appendChild(dc_param_list);
983
984 // get the content
985 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
986 dc_request.appendChild(doc_list);
987 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
988 doc_list.appendChild(current_doc);
989 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
990 //Append highlighted content to request for processing
991 dc_request.appendChild(doc.importNode(highlighted_Node, true));
992 Element hl_response_message = (Element) this.mr.process(hl_message);
993
994 //Get results
995 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
996 Element content = (Element) contentList.item(0);
997 return content;
998 }
999 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1000 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1001 if (query_term_list_element == null)
1002 {
1003 // no term info
1004 logger.error("No query term information.\n");
1005 return dc_response_doc_content;
1006 }
1007
1008 String content = GSXML.getNodeText(dc_response_doc_content);
1009
1010 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1011 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1012
1013 HashSet<String> query_term_variants = new HashSet<String>();
1014 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1015 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1016 {
1017 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1018 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1019 {
1020 for (int i = 0; i < terms_nodelist.getLength(); i++)
1021 {
1022 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1023 String termValueU = null;
1024 String termValueL = null;
1025
1026 if (termValue.length() > 1)
1027 {
1028 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1029 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1030 }
1031 else
1032 {
1033 termValueU = termValue.substring(0, 1).toUpperCase();
1034 termValueL = termValue.substring(0, 1).toLowerCase();
1035 }
1036
1037 query_term_variants.add(termValueU);
1038 query_term_variants.add(termValueL);
1039 }
1040 }
1041 }
1042 else
1043 {
1044 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1045 {
1046 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1047 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1048 for (int j = 0; j < equivalent_terms.length; j++)
1049 {
1050 query_term_variants.add(equivalent_terms[j]);
1051 }
1052 }
1053 }
1054
1055 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1056
1057 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1058 String performed_query = GSXML.getNodeText(query_element) + " ";
1059
1060 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1061 int term_start = 0;
1062 boolean in_term = false;
1063 boolean in_phrase = false;
1064 for (int i = 0; i < performed_query.length(); i++)
1065 {
1066 char character = performed_query.charAt(i);
1067 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1068
1069 // Has a query term just started?
1070 if (in_term == false && is_character_letter_or_digit == true)
1071 {
1072 in_term = true;
1073 term_start = i;
1074 }
1075
1076 // Or has a term just finished?
1077 else if (in_term == true && is_character_letter_or_digit == false)
1078 {
1079 in_term = false;
1080 String term = performed_query.substring(term_start, i);
1081
1082 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1083 if (term_element != null)
1084 {
1085
1086 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1087
1088 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1089 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1090 {
1091 String termValueU = null;
1092 String termValueL = null;
1093
1094 if (term.length() > 1)
1095 {
1096 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1097 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1098 }
1099 else
1100 {
1101 termValueU = term.substring(0, 1).toUpperCase();
1102 termValueL = term.substring(0, 1).toLowerCase();
1103 }
1104
1105 phrase_query_p_term_x_variants.add(termValueU);
1106 phrase_query_p_term_x_variants.add(termValueL);
1107 }
1108 else
1109 {
1110 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1111 {
1112 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1113 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1114 for (int k = 0; k < term_equivalent_terms.length; k++)
1115 {
1116 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1117 }
1118 }
1119 }
1120 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1121
1122 if (in_phrase == false)
1123 {
1124 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1125 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1126 }
1127 }
1128 }
1129 // Watch for phrases (surrounded by quotes)
1130 if (character == '\"')
1131 {
1132 // Has a phrase just started?
1133 if (in_phrase == false)
1134 {
1135 in_phrase = true;
1136 }
1137 // Or has a phrase just finished?
1138 else if (in_phrase == true)
1139 {
1140 in_phrase = false;
1141 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1142 }
1143
1144 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1145 }
1146 }
1147
1148 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1149 }
1150
1151 /**
1152 * Highlights query terms in a piece of text.
1153 */
1154 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1155 {
1156 // Convert the content string to an array of characters for speed
1157 char[] content_characters = new char[content.length()];
1158 content.getChars(0, content.length(), content_characters, 0);
1159
1160 // Now skim through the content, identifying word matches
1161 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1162 int word_start = 0;
1163 boolean in_word = false;
1164 boolean preceding_word_matched = false;
1165 boolean inTag = false;
1166 for (int i = 0; i < content_characters.length; i++)
1167 {
1168 //We don't want to find words inside HTML tags
1169 if (content_characters[i] == '<')
1170 {
1171 inTag = true;
1172 continue;
1173 }
1174 else if (inTag && content_characters[i] == '>')
1175 {
1176 inTag = false;
1177 }
1178 else if (inTag)
1179 {
1180 continue;
1181 }
1182
1183 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1184
1185 // Has a word just started?
1186 if (in_word == false && is_character_letter_or_digit == true)
1187 {
1188 in_word = true;
1189 word_start = i;
1190 }
1191
1192 // Or has a word just finished?
1193 else if (in_word == true && is_character_letter_or_digit == false)
1194 {
1195 in_word = false;
1196
1197 // Check if the word matches any of the query term equivalents
1198 String word = new String(content_characters, word_start, (i - word_start));
1199 if (query_term_variants.contains(word))
1200 {
1201 // We have found a matching word, so remember its location
1202 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1203 preceding_word_matched = true;
1204 }
1205 else
1206 {
1207 preceding_word_matched = false;
1208 }
1209 }
1210 }
1211
1212 // Don't forget the last word...
1213 if (in_word == true)
1214 {
1215 // Check if the word matches any of the query term equivalents
1216 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1217 if (query_term_variants.contains(word))
1218 {
1219 // We have found a matching word, so remember its location
1220 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1221 }
1222 }
1223
1224 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1225 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1226
1227 // Deal with phrases now
1228 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1229 for (int i = 0; i < word_matches.size(); i++)
1230 {
1231 WordMatch word_match = word_matches.get(i);
1232
1233 // See if any partial phrase matches are extended by this word
1234 if (word_match.preceding_word_matched)
1235 {
1236 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1237 {
1238 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1239 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1240 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1241 if (phrase_query_p_term_x_variants.contains(word_match.word))
1242 {
1243 partial_phrase_match.num_words_matched++;
1244
1245 // Has a complete phrase match occurred?
1246 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1247 {
1248 // Check for overlaps by looking at the previous highlight range
1249 if (!highlight_end_positions.isEmpty())
1250 {
1251 int last_highlight_index = highlight_end_positions.size() - 1;
1252 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1253 if (last_highlight_end > partial_phrase_match.start_position)
1254 {
1255 // There is an overlap, so remove the previous phrase match
1256 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1257 highlight_end_positions.remove(last_highlight_index);
1258 partial_phrase_match.start_position = last_highlight_start;
1259 }
1260 }
1261
1262 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1263 highlight_end_positions.add(new Integer(word_match.end_position));
1264 }
1265 // No, but add the partial match back into the list for next time
1266 else
1267 {
1268 partial_phrase_matches.add(partial_phrase_match);
1269 }
1270 }
1271 }
1272 }
1273 else
1274 {
1275 partial_phrase_matches.clear();
1276 }
1277
1278 // See if this word is at the start of any of the phrases
1279 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1280 {
1281 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1282 if (phrase_query_p_term_variants_list.size()>0) {
1283 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1284 if (phrase_query_p_term_1_variants.contains(word_match.word))
1285 {
1286 // If this phrase is just one word long, we have a complete match
1287 if (phrase_query_p_term_variants_list.size() == 1)
1288 {
1289 highlight_start_positions.add(new Integer(word_match.start_position));
1290 highlight_end_positions.add(new Integer(word_match.end_position));
1291 }
1292 // Otherwise we have the start of a potential phrase match
1293 else
1294 {
1295 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1296 }
1297 }
1298 }
1299 }
1300 }
1301
1302 // Now add the annotation tags into the document at the correct points
1303 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1304
1305 int last_wrote = 0;
1306 for (int i = 0; i < highlight_start_positions.size(); i++)
1307 {
1308 int highlight_start = highlight_start_positions.get(i).intValue();
1309 int highlight_end = highlight_end_positions.get(i).intValue();
1310
1311 // Print anything before the highlight range
1312 if (last_wrote < highlight_start)
1313 {
1314 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1315 content_element.appendChild(doc.createTextNode(preceding_text));
1316 }
1317
1318 // Print the highlight text, annotated
1319 if (highlight_end > last_wrote)
1320 {
1321 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1322 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1323 annotation_element.setAttribute("type", "query_term");
1324 content_element.appendChild(annotation_element);
1325 last_wrote = highlight_end;
1326 }
1327 }
1328
1329 // Finish off any unwritten text
1330 if (last_wrote < content_characters.length)
1331 {
1332 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1333 content_element.appendChild(doc.createTextNode(remaining_text));
1334 }
1335 return content_element;
1336 }
1337
1338 static private class WordMatch
1339 {
1340 public String word;
1341 public int start_position;
1342 public int end_position;
1343 public boolean preceding_word_matched;
1344
1345 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1346 {
1347 this.word = word;
1348 this.start_position = start_position;
1349 this.end_position = end_position;
1350 this.preceding_word_matched = preceding_word_matched;
1351 }
1352 }
1353
1354 static private class PartialPhraseMatch
1355 {
1356 public int start_position;
1357 public int query_phrase_number;
1358 public int num_words_matched;
1359
1360 public PartialPhraseMatch(int start_position, int query_phrase_number)
1361 {
1362 this.start_position = start_position;
1363 this.query_phrase_number = query_phrase_number;
1364 this.num_words_matched = 1;
1365 }
1366 }
1367}
Note: See TracBrowser for help on using the repository browser.