source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 37177

Last change on this file since 37177 was 37177, checked in by davidb, 15 months ago

Introduction of new optional parameter docVersion. If null (or equal to the empty string), then code works as before. Designed to work with the file-level document-version history mechanism, if non-empty, then this value is used to change where doc.xml on the file system is read from

  • Property svn:keywords set to Author Date Id Revision
File size: 64.0 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24import org.greenstone.gsdl3.service.DocXMLUtil;
25import org.greenstone.gsdl3.util.*;
26import org.greenstone.util.GlobalProperties;
27
28// XML classes
29import org.w3c.dom.Document;
30import org.w3c.dom.Element;
31import org.w3c.dom.Node;
32import org.w3c.dom.Text;
33import org.w3c.dom.NodeList;
34
35// General Java classes
36import java.util.ArrayList;
37import java.util.HashMap;
38import java.util.HashSet;
39import java.util.Iterator;
40import java.io.File;
41import java.io.Serializable;
42
43import org.apache.log4j.*;
44
45/** Action class for retrieving Documents via the message router */
46public class DocumentAction extends Action
47{
48
/** Logger shared by all instances of this action. */
static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());

/** Request parameter: a value of "1" asks for the sibling nodes of the selected node to be retrieved as well. */
public static final String SIBLING_ARG = "sib";
/** Request parameter: when non-empty, the value N is appended to the document id as ".N.ss". */
public static final String GOTO_PAGE_ARG = "gp";
/** Request parameter: a value of "1" sends the document content to an enrich service (only when annotations are enabled). */
public static final String ENRICH_DOC_ARG = "end";
/** Request parameter: a value of "1" retrieves the content of every document node, not just the selected one. */
public static final String EXPAND_DOCUMENT_ARG = "ed";
/** Request parameter: a value of "1" requests the entire document structure. */
public static final String EXPAND_CONTENTS_ARG = "ec";
/** Request parameter "book"; not read in the part of this file reviewed here - presumably the realistic-book display toggle (TODO confirm). */
public static final String REALISTIC_BOOK_ARG = "book";
/** Request parameter: a value of "1" skips the content request, unless the whole document is being expanded. */
public static final String NO_TEXT_ARG = "noText";
/** Request parameter: a value of "1" switches to document-editing mode, which returns the formatted archive document. */
public static final String DOC_EDIT_ARG = "docEdit";
/** Request parameter: optional document version, passed on to the archive-document request in editing mode. */
public static final String DOC_VERSION_ARG = "dv";

/**
 * if this is set to true, when a document is displayed, any annotation type
 * services (enrich) will be offered to the user as well.
 * Enabled by the "displayAnnotationService" config parameter.
 */
protected boolean provide_annotations = false;

/** Whether query terms get highlighted in metadata and text; enabled by the "highlightQueryTerms" config parameter. */
protected boolean highlight_query_terms = false;
69
70 public boolean configure()
71 {
72 super.configure();
73 String highlight = (String) config_params.get("highlightQueryTerms");
74 if (highlight != null && highlight.equals("true"))
75 {
76 highlight_query_terms = true;
77 }
78 String annotate = (String) config_params.get("displayAnnotationService");
79 if (annotate != null && annotate.equals("true"))
80 {
81 provide_annotations = true;
82 }
83 return true;
84 }
85
86 public Node process(Node message_node)
87 {
88 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
89
90 Element message = GSXML.nodeToElement(message_node);
91 Document doc = XMLConverter.newDOM();
92
93 // the response
94 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
95 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
96 result.appendChild(page_response);
97
98 // get the request - assume only one
99 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
100 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
101 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
102
103 // just in case there are some that need to get passed to the services
104 // why do we use s0 here and s1 in other places???
105 HashMap service_params = (HashMap) params.get("s0");
106
107 String collection = (String) params.get(GSParams.COLLECTION);
108 String document_id = (String) params.get(GSParams.DOCUMENT);
109 if (document_id != null && document_id.equals(""))
110 {
111 document_id = null;
112 }
113 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
114 if (href != null && href.equals(""))
115 {
116 href = null;
117 }
118 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
119 if (document_id == null && href == null)
120 {
121 logger.error("no document specified!");
122 return result;
123 }
124 if (rl != null && rl.equals("0"))
125 {
126 // this is a true external link, we should have been directed to a different page or action
127 logger.error("rl value was 0, shouldn't get here");
128 return result;
129 }
130
131 String doc_id_modifier = "";
132 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
133 if (sibling_num != null && !sibling_num.equals(""))
134 {
135 // we have to modify the doc name
136 doc_id_modifier = "." + sibling_num + ".ss";
137 }
138
139
140 UserContext userContext = new UserContext(request);
141
142 //append site metadata
143 addSiteMetadata(page_response, userContext);
144 addInterfaceOptions(page_response);
145
146 // get the additional data needed for the page
147 getBackgroundData(page_response, collection, userContext);
148
149 // create a basic doc list containing the current node
150 // we will use this to query whether the id is valid, and to get document type
151 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
152 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
153 basic_doc_list.appendChild(current_doc);
154 if (document_id != null)
155 {
156 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
157 }
158 else
159 {
160 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
161 // do we need this??
162 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
163 }
164
165 // lets do a quick check here for valid doc id.
166 if (document_id != null) {
167 boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
168 if (!is_valid) {
169 GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
170 return result;
171 }
172 }
173 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
174
175 if (format_elem != null) {
176 // lets look for param defaults set in config file
177 NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
178 for (int i=0; i<param_defaults.getLength(); i++) {
179 Element p = (Element)param_defaults.item(i);
180 String name = p.getAttribute(GSXML.NAME_ATT);
181 if (params.get(name) ==null) {
182 // wasn't set from interface
183 String value = p.getAttribute(GSXML.VALUE_ATT);
184 params.put(name, value );
185 // also add into request param xml so that xslt knows it too
186 GSXML.addParameterToList(cgi_paramList, name, value);
187 }
188 }
189 }
190
191 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
192 if (document_type != null && document_type.equals(""))
193 {
194 //document_type = "hierarchy";
195 document_type = null; // we'll get it later if not already specified
196 }
197 // what if it is null here?? Anu to check...
198
199
200 boolean editing_document = false;
201 String doc_edit = (String) params.get(DOC_EDIT_ARG);
202 if (doc_edit != null && doc_edit.equals("1")) {
203 editing_document = true;
204 }
205
206 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
207 if (editing_document) {
208 String opt_document_version = (String) params.get(DOC_VERSION_ARG);
209 return getFormattedArchiveDoc(doc, collection, document_id, opt_document_version, document_type, result, page_response, userContext);
210 }
211
212 //whether to retrieve siblings or not
213 boolean get_siblings = false;
214 String sibs = (String) params.get(SIBLING_ARG);
215 if (sibs != null && sibs.equals("1"))
216 {
217 get_siblings = true;
218 }
219
220 boolean expand_document = false;
221 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
222 if (ed_arg != null && ed_arg.equals("1"))
223 {
224 expand_document = true;
225 }
226
227 boolean expand_contents = false;
228 if (expand_document)
229 { // we always expand the contents with the text
230 expand_contents = true;
231 }
232 else
233 {
234 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
235 if (ec_arg != null && ec_arg.equals("1"))
236 {
237 expand_contents = true;
238 }
239 }
240
241 // do we want text content? Not if no_text=1.
242 // expand_document overrides this. - should it??
243 boolean get_text = true;
244 String nt_arg = (String) params.get(NO_TEXT_ARG);
245
246 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
247 logger.debug("SETTING GET TEXT TO FALSE");
248 get_text = false;
249 } else {
250 logger.debug("GET TEXT REMAINS TRUE");
251 }
252
253 // the_document is where all the doc info - structure and metadata etc
254 // is added into, to be returned in the page
255 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
256 page_response.appendChild(the_document);
257
258// used to create basic_doc_list here
259 if (document_type == null)
260 {
261 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
262 }
263 if (document_type == null)
264 {
265 logger.debug("##### doctype is null, setting to simple");
266 document_type = GSXML.DOC_TYPE_SIMPLE;
267 }
268
269 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
270
271 // start getting doc structure
272
273 // Create a parameter list to specify the required structure information
274 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
275
276 if (service_params != null)
277 {
278 GSXML.addParametersToList(ds_param_list, service_params);
279 }
280
281 Element ds_param = null;
282 boolean get_structure = false;
283 boolean get_structure_info = false;
284 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
285 {
286 get_structure_info = true;
287
288 if (expand_contents)
289 {
290 ds_param = doc.createElement(GSXML.PARAM_ELEM);
291 ds_param_list.appendChild(ds_param);
292 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
293 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
294 }
295
296 // get the info needed for paged naviagtion
297 ds_param = doc.createElement(GSXML.PARAM_ELEM);
298 ds_param_list.appendChild(ds_param);
299 ds_param.setAttribute(GSXML.NAME_ATT, "info");
300 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
301 ds_param = doc.createElement(GSXML.PARAM_ELEM);
302 ds_param_list.appendChild(ds_param);
303 ds_param.setAttribute(GSXML.NAME_ATT, "info");
304 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
305 ds_param = doc.createElement(GSXML.PARAM_ELEM);
306 ds_param_list.appendChild(ds_param);
307 ds_param.setAttribute(GSXML.NAME_ATT, "info");
308 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
309
310 if (get_siblings)
311 {
312 ds_param = doc.createElement(GSXML.PARAM_ELEM);
313 ds_param_list.appendChild(ds_param);
314 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
315 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
316 }
317
318 }
319 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
320 {
321 get_structure = true;
322 if (expand_contents)
323 {
324 ds_param = doc.createElement(GSXML.PARAM_ELEM);
325 ds_param_list.appendChild(ds_param);
326 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
328 }
329 else
330 {
331 // get the info needed for table of contents
332 ds_param = doc.createElement(GSXML.PARAM_ELEM);
333 ds_param_list.appendChild(ds_param);
334 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
335 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
336 ds_param = doc.createElement(GSXML.PARAM_ELEM);
337 ds_param_list.appendChild(ds_param);
338 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
340 if (get_siblings)
341 {
342 ds_param = doc.createElement(GSXML.PARAM_ELEM);
343 ds_param_list.appendChild(ds_param);
344 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
345 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
346 }
347 }
348 }
349 else
350 {
351 // we dont need any structure
352 }
353
354 boolean has_dummy = false;
355 if (get_structure || get_structure_info)
356 {
357
358 // Build a request to obtain the document structure
359 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
360 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
361 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
362 ds_message.appendChild(ds_request);
363 ds_request.appendChild(ds_param_list);
364
365 // add the node list we created earlier
366 ds_request.appendChild(basic_doc_list);
367
368 // Process the document structure retrieve message
369 Element ds_response_message = (Element) this.mr.process(ds_message);
370 if (processErrorElements(ds_response_message, page_response))
371 {
372 return result;
373 }
374
375 // get the info and print out
376 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
377 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
378 path = GSPath.appendLink(path, "nodeStructureInfo");
379 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
380 // get the doc_node bit
381 if (ds_response_struct_info != null)
382 {
383 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
384 }
385 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
386 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
387 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
388 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
389
390 if (ds_response_structure != null)
391 {
392 // add the contents of the structure bit into the_document
393 NodeList structs = ds_response_structure.getChildNodes();
394 for (int i = 0; i < structs.getLength(); i++)
395 {
396 the_document.appendChild(doc.importNode(structs.item(i), true));
397 }
398 }
399 else
400 {
401 // no structure nodes, so put in a dummy doc node
402 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
403 if (document_id != null)
404 {
405 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
406 }
407 else
408 {
409 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
410
411 }
412 the_document.appendChild(doc_node);
413 has_dummy = true;
414 }
415 }
416 else
417 { // a simple type - we dont have a dummy node for simple
418 // should think about this more
419 // no structure request, so just put in a dummy doc node
420 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
421 if (document_id != null)
422 {
423 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
424 }
425 else
426 {
427 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
428 }
429 the_document.appendChild(doc_node);
430 has_dummy = true;
431 }
432
433 // end getting doc structure
434
435 // start getting doc metadata
436
437 // Build a request to obtain some document metadata
438 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
439 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
440 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
441 dm_message.appendChild(dm_request);
442 // Create a parameter list to specify the required metadata information
443
444 HashSet<String> meta_names = new HashSet<String>();
445 meta_names.add("Title"); // the default
446 if (format_elem != null)
447 {
448 getRequiredMetadataNames(format_elem, meta_names);
449 }
450
451 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
452 if (extraMetaListElem != null)
453 {
454 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
455 for (int i = 0; i < extraMetaList.getLength(); i++)
456 {
457 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
458 }
459 }
460
461 Element dm_param_list = createMetadataParamList(doc,meta_names);
462 if (service_params != null)
463 {
464 GSXML.addParametersToList(dm_param_list, service_params);
465 }
466
467 dm_request.appendChild(dm_param_list);
468
469 // create the doc node list for the metadata request
470 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
471 dm_request.appendChild(dm_doc_list);
472
473 // Add each node from the structure response into the metadata request
474 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
475 for (int i = 0; i < doc_nodes.getLength(); i++)
476 {
477 Element doc_node = (Element) doc_nodes.item(i);
478 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
479
480 // Add the documentNode to the list
481 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
482 if (needSectionContent(params)) {
483 if (doc_node_id.equals(document_id)) {
484 dm_doc_list.appendChild(dm_doc_node);
485 }
486 } else {
487 dm_doc_list.appendChild(dm_doc_node);
488 }
489 //dm_doc_list.appendChild(dm_doc_node);
490 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
491 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
492 if (document_id == null){
493 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
494 }
495
496 }
497 // we also want a metadata request to the top level document to get
498 // assocfilepath - this could be cached too
499 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
500 dm_message.appendChild(doc_meta_request);
501 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
502 if (service_params != null)
503 {
504 GSXML.addParametersToList(doc_meta_param_list, service_params);
505 }
506
507 doc_meta_request.appendChild(doc_meta_param_list);
508 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
509 doc_meta_param_list.appendChild(doc_param);
510 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
511 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
512
513 // create the doc node list for the metadata request
514 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
515 doc_meta_request.appendChild(doc_list);
516
517 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
518 // the node we want is the root document node
519 if (document_id != null)
520 {
521 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
522 }
523 /*else
524 {
525 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
526 // can we assume that href is always a top level doc??
527 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
528 //doc_node.setAttribute("externalURL", has_rl);
529 }*/
530 doc_list.appendChild(doc_node);
531
532 Element dm_response_message = (Element) this.mr.process(dm_message);
533 if (processErrorElements(dm_response_message, page_response))
534 {
535 return result;
536 }
537
538 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
539 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
540
541 // Merge the metadata with the structure information
542 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
543 for (int i = 0; i < doc_nodes.getLength(); i++)
544 {
545 Node dcNode;
546 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
547 if (node_idd.isEmpty()) {
548 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
549 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
550 } else {
551 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
552 }
553 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
554 }
555 // get the top level doc metadata out
556 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
557 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
558 GSXML.mergeMetadataLists(the_document, top_doc_node);
559
560 // if we are highlighting query terms, then we also get them highlighted in the metadata
561
562 HashSet<String> query_term_variants = null;
563 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
564 boolean do_highlight_query_terms = highlight_query_terms;
565 int query_terms_status = 0;
566 if (highlight_query_terms) {
567 // lets get the query term equivalents
568 query_term_variants = new HashSet<String>();
569 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
570 if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
571 do_highlight_query_terms = false; // we couldn't get the terms
572 }
573 }
574
575 // lets try marking up the metadata with search terms
576 // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
577 boolean highlight_case_insensitive = false;
578 if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
579 highlight_case_insensitive = true;
580 }
581 if (do_highlight_query_terms) {
582 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
583 }
584
585 // do we want doc text content? If not, we are done.
586 if (!get_text) {
587 // don't get text
588 return result;
589 }
590
591 // Build a request to obtain some document content
592 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
593 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
594 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
595 dc_message.appendChild(dc_request);
596
597 // Create a parameter list to specify the request parameters - empty for now
598 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
599 if (service_params != null)
600 {
601 GSXML.addParametersToList(dc_param_list, service_params);
602 }
603
604 dc_request.appendChild(dc_param_list);
605
606 // get the content
607 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
608 if (expand_document)
609 {
610 dc_request.appendChild(dm_doc_list);
611 }
612 else
613 {
614 dc_request.appendChild(basic_doc_list);
615 }
616 Element dc_response_message = (Element) this.mr.process(dc_message);
617
618 if (processErrorElements(dc_response_message, page_response))
619 {
620 return result;
621
622 }
623 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
624
625 boolean get_marked_up_doc_from_query = false;
626 if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
627 get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
628 }
629
630 if (expand_document)
631 {
632 // Merge the content with the structure information
633 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
634 for (int i = 0; i < doc_nodes.getLength(); i++)
635 {
636 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
637 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
638 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
639 if (content != null)
640 {
641 if (do_highlight_query_terms) {
642 if (get_marked_up_doc_from_query) {
643
644 Element new_content = retrieveHighlightedContent(request, node_id);
645
646 if (new_content == null) {
647 // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
648 get_marked_up_doc_from_query = false;
649 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
650 } else {
651 content= new_content;
652 }
653 } else {
654 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
655 }
656 }
657 doc_nodes.item(i).appendChild(doc.importNode(content, true));
658 }
659
660 }
661 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
662 Element dummy_node = (Element) doc_nodes.item(0);
663 the_document.removeChild(dummy_node);
664 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
665 NodeList dummy_children = dummy_node.getChildNodes();
666 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
667 {
668 // special case as we don't want more than one metadata list
669 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
670 {
671 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
672 }
673 else
674 {
675 the_document.appendChild(dummy_children.item(i));
676 }
677 }
678 }
679 }
680 else
681 {
682 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
683 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
684
685 if (dc_response_doc_content == null)
686 {
687 // no content to add
688 if (dc_response_doc.getAttribute("external").equals("true"))
689 {
690 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
691
692 the_document.setAttribute("selectedNode", href_id);
693 the_document.setAttribute("external", href_id);
694 }
695 return result;
696 }
697 if (do_highlight_query_terms)
698 {
699 dc_response_doc.removeChild(dc_response_doc_content);
700 if (get_marked_up_doc_from_query) {
701 Element new_content = retrieveHighlightedContent(request, null);
702 if (new_content == null) {
703 get_marked_up_doc_from_query = false;
704 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
705 } else {
706
707 dc_response_doc_content = new_content;
708 }
709 } else {
710 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
711 }
712 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
713 }
714
715 if (provide_annotations)
716 {
717 String service_selected = (String) params.get(ENRICH_DOC_ARG);
718 if (service_selected != null && service_selected.equals("1"))
719 {
720 // now we can modifiy the response doc if needed
721 String enrich_service = (String) params.get(GSParams.SERVICE);
722 // send a message to the service
723 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
724 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
725 enrich_message.appendChild(enrich_request);
726 // check for parameters
727 HashMap e_service_params = (HashMap) params.get("s1");
728 if (e_service_params != null)
729 {
730 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
731 GSXML.addParametersToList(enrich_pl, e_service_params);
732 enrich_request.appendChild(enrich_pl);
733 }
734 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
735 enrich_request.appendChild(e_doc_list);
736 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
737
738 Node enrich_response = this.mr.process(enrich_message);
739
740 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
741 path = GSPath.createPath(links);
742 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
743
744 }
745 } // if provide_annotations
746
747 // use the returned id rather than the sent one cos there may have
748 // been modifiers such as .pr that are removed.
749 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
750 the_document.setAttribute("selectedNode", modified_doc_id);
751 if (has_dummy)
752 {
753 // change the id if necessary and add the content
754 Element dummy_node = (Element) doc_nodes.item(0);
755
756 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
757 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
758 // hack for simple type
759 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
760 {
761 // we dont want the internal docNode, just want the content and metadata in the document
762 // rethink this!!
763 the_document.removeChild(dummy_node);
764
765 NodeList dummy_children = dummy_node.getChildNodes();
766 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
767 {
768 // special case as we don't want more than one metadata list
769 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
770 {
771 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
772 }
773 else
774 {
775 the_document.appendChild(dummy_children.item(i));
776 }
777 }
778 }
779
780 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
781 }
782 else
783 {
784 // Merge the document content with the metadata and structure information
785 for (int i = 0; i < doc_nodes.getLength(); i++)
786 {
787 Node dn = doc_nodes.item(i);
788 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
789 if (dn_id.equals(modified_doc_id))
790 {
791 dn.appendChild(doc.importNode(dc_response_doc_content, true));
792 break;
793 }
794 }
795 }
796 }
797 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
798 return result;
799 }
800
801 protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
802 Document doc = basic_doc_list.getOwnerDocument();
803
804 Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
805 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
806 Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
807 v_message.appendChild(v_request);
808
809 // add the node list
810 v_request.appendChild(basic_doc_list);
811 Element v_response_message = (Element) this.mr.process(v_message);
812 if (processErrorElements(v_response_message, page_response))
813 {
814 return false;
815 }
816 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
817 String path = GSPath.createPath(links);
818 Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
819 if (info_elem == null) {
820 return false;
821 }
822 if (info_elem.getAttribute("valid").equals("true")) {
823 return true;
824 }
825 return false;
826
827 }
828
829 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String opt_document_version, String document_type,
830 Element result, Element page_response, UserContext userContext ) {
831 // call get archive doc
832 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
833 String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
834 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
835 dx_message.appendChild(dx_request);
836 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
837 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
838 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
839 dx_section.setAttribute(GSXML.DOC_VERSION_ATT, opt_document_version);
840 dx_request.appendChild(dx_section);
841
842 Element dx_response_message = (Element) this.mr.process(dx_message);
843 if (processErrorElements(dx_response_message, page_response))
844 {
845 return result;
846 }
847
848 // get the section out
849 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
850 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
851 if (section == null) {
852 logger.error("no archive doc returned for "+document_id);
853 return result;
854 }
855 // convert the archive format into the internal format that the page response requires
856
857 // work out doctype
858 // NOTE: this will be coming from collection database in index
859 // the archive file doesn't store this. So we have to assume
860 // that the doc type will not be changing with any
861 // modifications happening to archives.
862
863 // if doc type is null, then we need to work it out.
864 // create a basic doc list containing the current node
865
866 if (document_type == null) {
867 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
868 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
869 basic_doc_list.appendChild(current_doc);
870 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
871 basic_doc_list.appendChild(current_doc);
872 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
873 }
874
875 if (document_type == null) {
876 logger.debug("@@@ doctype is null, setting to simple");
877 document_type = GSXML.DOC_TYPE_SIMPLE;
878 }
879
880 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
881 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
882 page_response.appendChild(doc_elem);
883
884 Element transformed_section = transformArchiveToDocument(section);
885 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
886 // simple doc, only returning a single document node, which is the top level section.
887 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
888 GSXML.mergeElements(doc_elem, transformed_section);
889 return result;
890 }
891
892 // multi sectioned document.
893 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
894 // In docEdit mode, we obtain the text from archives, from doc.xml
895 // Now the transformation has replaced <Section> with <documentNode>
896 // Need to add nodeID, nodeType and docType attributes to each docNode
897 // as doc.xml doesn't store that.
898 insertDocNodeAttributes(transformed_section, document_type, null);
899 doc_elem.appendChild(doc.importNode(transformed_section, true));
900 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
901
902 return result;
903 }
904
905
906 private boolean needSectionContent(HashMap<String, Serializable> params) {
907 String document_id = (String) params.get(GSParams.DOCUMENT);
908 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
909 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
910 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
911 return true;
912 }
913
914 return false;
915 }
916 /**
917 * this method gets the collection description, the format info, the list of
918 * enrich services, etc - stuff that is needed for the page, but is the same
919 * whatever the query is - should be cached
920 */
921 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
922 {
923 Document doc = page_response.getOwnerDocument();
924
925 // create a message to process - contains requests for the collection
926 // description, the format element, the enrich services on offer
927 // these could all be cached
928 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
929 String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
930 // the format request - ignore for now, where does this request go to??
931 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
932 info_message.appendChild(format_request);
933
934 // the enrich_services request - only do this if provide_annotations is true
935
936 if (provide_annotations)
937 {
938 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
939 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
940 info_message.appendChild(enrich_services_request);
941 }
942
943 Element info_response = (Element) this.mr.process(info_message);
944
945 // the collection is the first response
946 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
947 Element format_resp = (Element) responses.item(0);
948
949 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
950 if (format_elem != null)
951 {
952 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
953 if (global_format_elem != null)
954 {
955 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
956 }
957
958 // set the format type
959 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
960 page_response.appendChild(doc.importNode(format_elem, true));
961 }
962
963 if (provide_annotations)
964 {
965 Element services_resp = (Element) responses.item(1);
966
967 // a new message for the mr
968 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
969 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
970 boolean service_found = false;
971 for (int j = 0; j < e_services.getLength(); j++)
972 {
973 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
974 {
975 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
976 enrich_message.appendChild(s);
977 service_found = true;
978 }
979 }
980 if (service_found)
981 {
982 Element enrich_response = (Element) this.mr.process(enrich_message);
983
984 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
985 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
986 for (int i = 0; i < e_responses.getLength(); i++)
987 {
988 Element e_resp = (Element) e_responses.item(i);
989 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
990 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
991 service_list.appendChild(e_service);
992 }
993 page_response.appendChild(service_list);
994 }
995 } // if provide_annotations
996 return true;
997
998 }
999
1000 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
1001 {
1002 Document doc = basic_doc_list.getOwnerDocument();
1003
1004 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
1005 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
1006 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1007 ds_message.appendChild(ds_request);
1008
1009 // Create a parameter list to specify the required structure information
1010 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1011 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
1012 ds_param_list.appendChild(ds_param);
1013 ds_param.setAttribute(GSXML.NAME_ATT, "info");
1014 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1015
1016 ds_request.appendChild(ds_param_list);
1017
1018 // add the node list we created earlier
1019 ds_request.appendChild(basic_doc_list);
1020
1021 // Process the document structure retrieve message
1022 Element ds_response_message = (Element) this.mr.process(ds_message);
1023 if (processErrorElements(ds_response_message, page_response))
1024 {
1025 return null;
1026 }
1027
1028 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1029 String path = GSPath.createPath(links);
1030 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1031 if (info_elem == null) {
1032 return null;
1033 }
1034 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1035 if (doctype_elem != null)
1036 {
1037 String doc_type = doctype_elem.getAttribute("value");
1038 return doc_type;
1039 }
1040 return null;
1041 }
1042
1043 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1044 // The docType remains constant as in parameter document_type
1045 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1046 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1047 // The nodeType is root if rootNode, internal if there are children and leaf if no children
1048 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1049
1050 boolean isRoot = false;
1051 if(id == null) { // rootNode, get the root nodeID to work with recursively
1052 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1053 isRoot = true;
1054 } else { // for all but the root node, need to still set the nodeID
1055 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1056 }
1057
1058 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1059
1060 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1061 if(docNodes.getLength() > 0) {
1062 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1063 for(int i = 0; i < docNodes.getLength(); i++) {
1064 Element childDocNode = (Element)docNodes.item(i);
1065
1066 // work out the child docNode's nodeID based on current id
1067 String nodeID = id + "." + (i+1);
1068 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1069 }
1070 } else {
1071 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1072 }
1073
1074 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1075 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1076
1077 }
1078
1079 /** run the XSLT transform which converts from doc.xml format to our internal document format */
1080 protected Element transformArchiveToDocument(Element section) {
1081
1082 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1083 if (stylesheet_filename == null) {
1084 logger.error("Couldn't find stylesheet archive2document.xsl");
1085 return section;
1086 }
1087
1088 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1089 if (stylesheet_doc == null) {
1090 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1091 return section;
1092 }
1093
1094 Document section_doc = XMLConverter.newDOM();
1095 section_doc.appendChild(section_doc.importNode(section, true));
1096 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1097 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1098
1099 Element new_element;
1100 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1101 new_element = ((Document) result).getDocumentElement();
1102 } else {
1103 new_element = (Element) result;
1104 }
1105
1106
1107 return new_element;
1108
1109 }
1110
1111 protected final int NO_QUERY_TERMS = 0;
1112 protected final int NO_EQUIV_QUERY_TERMS = 1;
1113 protected final int EQUIV_QUERY_TERMS = 2;
1114 /**
1115 * this involves a bit of a hack to get the equivalent query terms - has to
1116 * requery the query service - uses the last selected service name. (if it
1117 * ends in query).
1118 */
1119 protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1120 {
1121 Document doc = XMLConverter.newDOM();
1122
1123 // do the query again to get term info
1124 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1125 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1126
1127 HashMap previous_params = (HashMap) params.get("p");
1128 if (previous_params == null)
1129 {
1130 return NO_QUERY_TERMS;
1131 }
1132 String service_name = (String) previous_params.get(GSParams.SERVICE);
1133 if (service_name == null || !service_name.endsWith("Query"))
1134 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1135 logger.debug("invalid service "+service_name+", not doing highlighting");
1136 return NO_QUERY_TERMS;
1137 }
1138
1139 String collection = (String) params.get(GSParams.COLLECTION);
1140 UserContext userContext = new UserContext(request);
1141 String to = GSPath.appendLink(collection, service_name);
1142
1143 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1144 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1145 mr_query_message.appendChild(mr_query_request);
1146
1147 // paramList
1148 HashMap service_params = (HashMap) params.get("s1");
1149
1150 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1151 GSXML.addParametersToList(query_param_list, service_params);
1152 mr_query_request.appendChild(query_param_list);
1153
1154 // do the query
1155 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1156
1157 // find the term lists
1158 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1159 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1160 if (query_term_list_element == null)
1161 {
1162 // no term info
1163 return NO_QUERY_TERMS;
1164 }
1165
1166 int result_code = NO_EQUIV_QUERY_TERMS;
1167 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1168 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1169 {
1170 // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1171 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1172 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1173 {
1174 for (int i = 0; i < terms_nodelist.getLength(); i++)
1175 {
1176 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1177 query_term_variants.add(termValue.toLowerCase());
1178 }
1179 }
1180 }
1181 else
1182 {
1183 result_code = EQUIV_QUERY_TERMS;
1184 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1185 {
1186 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1187 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1188 for (int j = 0; j < equivalent_terms.length; j++)
1189 {
1190 query_term_variants.add(equivalent_terms[j]);
1191 }
1192 }
1193 }
1194
1195 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1196 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1197
1198 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1199 String performed_query = GSXML.getNodeText(query_element) + " ";
1200 logger.debug("performed query="+performed_query);
1201
1202 boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1203 if (performed_query.contains("\"")) {
1204 has_phrases = true;
1205 }
1206
1207 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1208 int term_start = 0;
1209 boolean in_term = false;
1210 boolean in_phrase = false;
1211 for (int i = 0; i < performed_query.length(); i++) {
1212
1213 char character = performed_query.charAt(i);
1214 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1215
1216 // Has a query term just started?
1217 if (in_term == false && is_character_letter_or_digit == true)
1218 {
1219 in_term = true;
1220 term_start = i;
1221 }
1222
1223 // Or has a term just finished?
1224 else if (in_term == true && is_character_letter_or_digit == false)
1225 {
1226 in_term = false;
1227 String term = performed_query.substring(term_start, i);
1228 if (has_phrases) {
1229 // do the phrase bit
1230 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1231 if (result_code == EQUIV_QUERY_TERMS) {
1232 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1233 if (term_element != null) {
1234 // might be null for eg TX in [snails]:TX
1235
1236 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1237 if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
1238 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1239 {
1240 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1241 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1242 for (int k = 0; k < term_equivalent_terms.length; k++)
1243 {
1244 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1245 }
1246 }
1247 }
1248 }
1249 } else { // result_code != EQUIV_QUERY_TERMS
1250 // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1251 if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
1252 // this handles the case where the user has searched for snails, but term list returns 'snail'
1253 phrase_query_p_term_x_variants.add(term.toLowerCase());
1254 }
1255 }
1256 if (phrase_query_p_term_x_variants.size()>0) {
1257 // we have found a valid term
1258 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1259
1260 if (in_phrase == false)
1261 {
1262 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1263 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1264 }
1265 }
1266 } // end if has_phrases
1267 else {
1268 // no phrases so we don't have to do the phrasey stuff. but
1269 // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1270 // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1271 // but dont want to include eg TX
1272 if (result_code == NO_EQUIV_QUERY_TERMS) {
1273 if (containsSubString(query_term_variants, term)) {
1274 query_term_variants.add(term.toLowerCase());
1275 }
1276 }
1277
1278 }
1279 } // end of in_term...
1280 // Watch for phrases (surrounded by quotes)
1281 if (character == '\"') {
1282
1283 // Has a phrase just started?
1284 if (in_phrase == false)
1285 {
1286 in_phrase = true;
1287 }
1288 // Or has a phrase just finished?
1289 else if (in_phrase == true)
1290 {
1291 in_phrase = false;
1292 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1293 }
1294
1295 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1296 } // if char == "
1297 } // for each char in performed query
1298
1299 return result_code;
1300 }
1301
1302 protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1303 // hack to filter out TX, TI field names
1304 String lc_term = term.toLowerCase();
1305 if (query_term_variants.contains(term)) {
1306 return false; // or true??
1307 }
1308 if (term.matches("[A-Z][A-Z][A-Z]?")) {
1309 return false;
1310 }
1311 Iterator i = query_term_variants.iterator();
1312 while (i.hasNext()) {
1313 String t = (String)i.next();
1314 if (term.startsWith(t)) {
1315 return true;
1316 }
1317 }
1318 return false;
1319 }
1320
1321
1322 /** retrieve the marked up highlighted section - only works for solr collection */
1323 protected Element retrieveHighlightedContent(Element request, String node_id) {
1324
1325 Document doc = XMLConverter.newDOM();
1326
1327 // do the query again to get term info
1328 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1329 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1330
1331 HashMap previous_params = (HashMap) params.get("p");
1332 if (previous_params == null)
1333 {
1334 return null;
1335 }
1336 String service_name = (String) previous_params.get(GSParams.SERVICE);
1337 if (service_name == null || !service_name.endsWith("Query"))
1338 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1339 logger.debug("HL invalid service, not doing highlighting");
1340 return null;
1341 }
1342
1343 String collection = (String) params.get(GSParams.COLLECTION);
1344 UserContext userContext = new UserContext(request);
1345 String to = GSPath.appendLink(collection, service_name);
1346
1347 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1348 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1349 mr_query_message.appendChild(mr_query_request);
1350
1351 // paramList
1352 HashMap service_params = (HashMap) params.get("s1");
1353
1354 // hack in case the user searched on eg titles, but we want highlighting in the text
1355 service_params.put("index", "TX");
1356 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1357 GSXML.addParametersToList(query_param_list, service_params);
1358
1359 if (node_id != null) {
1360 GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1361 } else {
1362 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1363 }
1364 mr_query_request.appendChild(query_param_list);
1365 // do the query
1366
1367 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1368 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1369 Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1370
1371 if (highlighted_node == null) {
1372 return null;
1373 }
1374 // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1375 //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1376
1377 // Build a request to process highlighted text
1378
1379 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1380 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1381 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1382 hl_message.appendChild(dc_request);
1383
1384 // Create a parameter list to specify the request parameters - empty for now
1385 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1386 dc_request.appendChild(dc_param_list);
1387
1388 // get the content
1389 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1390 dc_request.appendChild(doc_list);
1391 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1392 doc_list.appendChild(current_doc);
1393 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1394 //Append highlighted content to request for processing
1395 dc_request.appendChild(doc.importNode(highlighted_node, true));
1396 Element hl_response_message = (Element) this.mr.process(hl_message);
1397 //Get results
1398 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1399 Element content = (Element) contentList.item(0);
1400 return content;
1401
1402
1403 }
1404 /**
1405 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1406 */
1407 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1408
1409 NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1410 for (int j=named_elems.getLength()-1; j>=0; j--) {
1411 Element this_elem = (Element)named_elems.item(j);
1412 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1413 this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1414 }
1415 return true;
1416 }
1417 /**
1418 * Highlights query terms in the text content of an element.
1419 */
1420 private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1421 {
1422 String content = GSXML.getNodeText(original_element);
1423 // Convert the content string to an array of characters for speed
1424 char[] content_characters = new char[content.length()];
1425 content.getChars(0, content.length(), content_characters, 0);
1426
1427 // Now skim through the content, identifying word matches
1428 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1429 int word_start = 0;
1430 boolean in_word = false;
1431 boolean preceding_word_matched = false;
1432 boolean inTag = false;
1433 for (int i = 0; i < content_characters.length; i++)
1434 {
1435 //We don't want to find words inside HTML tags
1436 if (content_characters[i] == '<')
1437 {
1438 // are we currently in a word?
1439 if (in_word) {
1440 in_word = false;
1441 String word = new String(content_characters, word_start, (i - word_start));
1442 if (case_insensitive) {
1443 word = word.toLowerCase();
1444 }
1445 if (query_term_variants.contains(word)) {
1446 // We have found a matching word, so remember its location
1447 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1448 // should preceding word matched be set to true/false here??
1449 preceding_word_matched = true;
1450 } else {
1451 preceding_word_matched = false;
1452 }
1453 }
1454 inTag = true;
1455 continue;
1456 }
1457 else if (inTag && content_characters[i] == '>')
1458 {
1459 inTag = false;
1460 continue;
1461 }
1462 else if (inTag)
1463 {
1464 continue;
1465 }
1466
1467 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1468
1469 // Has a word just started?
1470 if (in_word == false && is_character_letter_or_digit == true)
1471 {
1472 in_word = true;
1473 word_start = i;
1474 }
1475
1476 // Or has a word just finished?
1477 else if (in_word == true && is_character_letter_or_digit == false)
1478 {
1479 in_word = false;
1480
1481 // Check if the word matches any of the query term equivalents
1482 String word = new String(content_characters, word_start, (i - word_start));
1483 if (case_insensitive) {
1484 word = word.toLowerCase();
1485 }
1486 if (query_term_variants.contains(word))
1487 {
1488 // We have found a matching word, so remember its location
1489 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1490 preceding_word_matched = true;
1491 }
1492 else
1493 {
1494 preceding_word_matched = false;
1495 }
1496 }
1497 }
1498
1499 // Don't forget the last word...
1500 if (in_word == true)
1501 {
1502 // Check if the word matches any of the query term equivalents
1503 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1504 if (case_insensitive) {
1505 word = word.toLowerCase();
1506 }
1507 if (query_term_variants.contains(word))
1508 {
1509 // We have found a matching word, so remember its location
1510 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1511 }
1512 }
1513
1514 if (word_matches.size() == 0) {
1515 // just return a copy of the original element
1516 return (Element)doc.importNode(original_element, true);
1517
1518 }
1519
1520 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1521 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1522
1523 if (phrase_query_term_variants_hierarchy.size() ==0) {
1524 for (int i = 0; i < word_matches.size(); i++) {
1525 highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1526 highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1527 }
1528 }
1529 else {
1530 // Deal with phrases now
1531 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1532 for (int i = 0; i < word_matches.size(); i++)
1533 {
1534 WordMatch word_match = word_matches.get(i);
1535
1536 // See if any partial phrase matches are extended by this word
1537 if (word_match.preceding_word_matched)
1538 {
1539 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1540 {
1541 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1542 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1543 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1544 if (phrase_query_p_term_x_variants.contains(word_match.word))
1545 {
1546 partial_phrase_match.num_words_matched++;
1547
1548 // Has a complete phrase match occurred?
1549 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1550 {
1551 // Check for overlaps by looking at the previous highlight range
1552 if (!highlight_end_positions.isEmpty())
1553 {
1554 int last_highlight_index = highlight_end_positions.size() - 1;
1555 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1556 if (last_highlight_end > partial_phrase_match.start_position)
1557 {
1558 // There is an overlap, so remove the previous phrase match
1559 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1560 highlight_end_positions.remove(last_highlight_index);
1561 partial_phrase_match.start_position = last_highlight_start;
1562 }
1563 }
1564
1565 highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1566 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1567 }
1568 // No, but add the partial match back into the list for next time
1569 else
1570 {
1571 partial_phrase_matches.add(partial_phrase_match);
1572 }
1573 }
1574 }
1575 }
1576 else
1577 {
1578 partial_phrase_matches.clear();
1579 }
1580
1581 // See if this word is at the start of any of the phrases
1582 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1583 {
1584 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1585 if (phrase_query_p_term_variants_list.size()>0) {
1586 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1587 if (phrase_query_p_term_1_variants.contains(word_match.word))
1588 {
1589 // If this phrase is just one word long, we have a complete match
1590 if (phrase_query_p_term_variants_list.size() == 1)
1591 {
1592 highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1593 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1594 }
1595 // Otherwise we have the start of a potential phrase match
1596 else
1597 {
1598 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1599 }
1600 }
1601 }
1602 }
1603 }
1604 }
1605
1606 // Now add the annotation tags into the document at the correct points
1607 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1608 int last_wrote = 0;
1609 for (int i = 0; i < highlight_start_positions.size(); i++)
1610 {
1611 int highlight_start = highlight_start_positions.get(i).intValue();
1612 int highlight_end = highlight_end_positions.get(i).intValue();
1613
1614 // Print anything before the highlight range
1615 if (last_wrote < highlight_start)
1616 {
1617 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1618 content_element.appendChild(doc.createTextNode(preceding_text));
1619 }
1620
1621 // Print the highlight text, annotated
1622 if (highlight_end > last_wrote)
1623 {
1624 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1625 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1626 annotation_element.setAttribute("type", "query_term");
1627 content_element.appendChild(annotation_element);
1628 last_wrote = highlight_end;
1629 }
1630 }
1631
1632 // Finish off any unwritten text
1633 if (last_wrote < content_characters.length)
1634 {
1635 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1636 content_element.appendChild(doc.createTextNode(remaining_text));
1637 }
1638 return content_element;
1639 }
1640
1641
1642 static private class WordMatch
1643 {
1644 public String word;
1645 public int start_position;
1646 public int end_position;
1647 public boolean preceding_word_matched;
1648
1649 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1650 {
1651 this.word = word;
1652 this.start_position = start_position;
1653 this.end_position = end_position;
1654 this.preceding_word_matched = preceding_word_matched;
1655 }
1656 }
1657
1658 static private class PartialPhraseMatch
1659 {
1660 public int start_position;
1661 public int query_phrase_number;
1662 public int num_words_matched;
1663
1664 public PartialPhraseMatch(int start_position, int query_phrase_number)
1665 {
1666 this.start_position = start_position;
1667 this.query_phrase_number = query_phrase_number;
1668 this.num_words_matched = 1;
1669 }
1670 }
1671}
Note: See TracBrowser for help on using the repository browser.