source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 36978

Last change on this file since 36978 was 36978, checked in by kjdon, 17 months ago

Now this does a check to see if the doc id is valid before proceeding to get the structure, metadata etc. If the id is invalid, the page will contain an error element and not a document element. Also replaced hard coded service names with their variable names.

  • Property svn:keywords set to Author Date Id Revision
File size: 63.8 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24import org.greenstone.gsdl3.service.DocXMLUtil;
25import org.greenstone.gsdl3.util.*;
26import org.greenstone.util.GlobalProperties;
27
28// XML classes
29import org.w3c.dom.Document;
30import org.w3c.dom.Element;
31import org.w3c.dom.Node;
32import org.w3c.dom.Text;
33import org.w3c.dom.NodeList;
34
35// General Java classes
36import java.util.ArrayList;
37import java.util.HashMap;
38import java.util.HashSet;
39import java.util.Iterator;
40import java.io.File;
41import java.io.Serializable;
42
43import org.apache.log4j.*;
44
45/** Action class for retrieving Documents via the message router */
46public class DocumentAction extends Action
47{
48
49 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
50
51 // this is used to specify that the sibling nodes of a selected one should be obtained
52 public static final String SIBLING_ARG = "sib";
53 public static final String GOTO_PAGE_ARG = "gp";
54 public static final String ENRICH_DOC_ARG = "end";
55 public static final String EXPAND_DOCUMENT_ARG = "ed";
56 public static final String EXPAND_CONTENTS_ARG = "ec";
57 public static final String REALISTIC_BOOK_ARG = "book";
58 public static final String NO_TEXT_ARG = "noText";
59 public static final String DOC_EDIT_ARG = "docEdit";
60
61 /**
62 * if this is set to true, when a document is displayed, any annotation type
63 * services (enrich) will be offered to the user as well
64 */
65 protected boolean provide_annotations = false;
66
67 protected boolean highlight_query_terms = false;
68
69 public boolean configure()
70 {
71 super.configure();
72 String highlight = (String) config_params.get("highlightQueryTerms");
73 if (highlight != null && highlight.equals("true"))
74 {
75 highlight_query_terms = true;
76 }
77 String annotate = (String) config_params.get("displayAnnotationService");
78 if (annotate != null && annotate.equals("true"))
79 {
80 provide_annotations = true;
81 }
82 return true;
83 }
84
85 public Node process(Node message_node)
86 {
87 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
88
89 Element message = GSXML.nodeToElement(message_node);
90 Document doc = XMLConverter.newDOM();
91
92 // the response
93 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
94 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
95 result.appendChild(page_response);
96
97 // get the request - assume only one
98 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
99 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
100 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
101
102 // just in case there are some that need to get passed to the services
103 // why do we use s0 here and s1 in other places???
104 HashMap service_params = (HashMap) params.get("s0");
105
106 String collection = (String) params.get(GSParams.COLLECTION);
107 String document_id = (String) params.get(GSParams.DOCUMENT);
108 if (document_id != null && document_id.equals(""))
109 {
110 document_id = null;
111 }
112 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
113 if (href != null && href.equals(""))
114 {
115 href = null;
116 }
117 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
118 if (document_id == null && href == null)
119 {
120 logger.error("no document specified!");
121 return result;
122 }
123 if (rl != null && rl.equals("0"))
124 {
125 // this is a true external link, we should have been directed to a different page or action
126 logger.error("rl value was 0, shouldn't get here");
127 return result;
128 }
129
130 String doc_id_modifier = "";
131 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
132 if (sibling_num != null && !sibling_num.equals(""))
133 {
134 // we have to modify the doc name
135 doc_id_modifier = "." + sibling_num + ".ss";
136 }
137
138
139 UserContext userContext = new UserContext(request);
140
141 //append site metadata
142 addSiteMetadata(page_response, userContext);
143 addInterfaceOptions(page_response);
144
145 // get the additional data needed for the page
146 getBackgroundData(page_response, collection, userContext);
147
148 // create a basic doc list containing the current node
149 // we will use this to query whether the id is valid, and to get document type
150 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
151 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
152 basic_doc_list.appendChild(current_doc);
153 if (document_id != null)
154 {
155 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
156 }
157 else
158 {
159 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
160 // do we need this??
161 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
162 }
163
164 // lets do a quick check here for valid doc id.
165 if (document_id != null) {
166 boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
167 if (!is_valid) {
168 GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
169 return result;
170 }
171 }
172 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
173
174 if (format_elem != null) {
175 // lets look for param defaults set in config file
176 NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
177 for (int i=0; i<param_defaults.getLength(); i++) {
178 Element p = (Element)param_defaults.item(i);
179 String name = p.getAttribute(GSXML.NAME_ATT);
180 if (params.get(name) ==null) {
181 // wasn't set from interface
182 String value = p.getAttribute(GSXML.VALUE_ATT);
183 params.put(name, value );
184 // also add into request param xml so that xslt knows it too
185 GSXML.addParameterToList(cgi_paramList, name, value);
186 }
187 }
188 }
189
190 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
191 if (document_type != null && document_type.equals(""))
192 {
193 //document_type = "hierarchy";
194 document_type = null; // we'll get it later if not already specified
195 }
196 // what if it is null here?? Anu to check...
197
198
199 boolean editing_document = false;
200 String doc_edit = (String) params.get(DOC_EDIT_ARG);
201 if (doc_edit != null && doc_edit.equals("1")) {
202 editing_document = true;
203 }
204
205 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
206 if (editing_document) {
207 return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
208 }
209
210 //whether to retrieve siblings or not
211 boolean get_siblings = false;
212 String sibs = (String) params.get(SIBLING_ARG);
213 if (sibs != null && sibs.equals("1"))
214 {
215 get_siblings = true;
216 }
217
218 boolean expand_document = false;
219 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
220 if (ed_arg != null && ed_arg.equals("1"))
221 {
222 expand_document = true;
223 }
224
225 boolean expand_contents = false;
226 if (expand_document)
227 { // we always expand the contents with the text
228 expand_contents = true;
229 }
230 else
231 {
232 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
233 if (ec_arg != null && ec_arg.equals("1"))
234 {
235 expand_contents = true;
236 }
237 }
238
239 // do we want text content? Not if no_text=1.
240 // expand_document overrides this. - should it??
241 boolean get_text = true;
242 String nt_arg = (String) params.get(NO_TEXT_ARG);
243
244 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
245 logger.debug("SETTING GET TEXT TO FALSE");
246 get_text = false;
247 } else {
248 logger.debug("GET TEXT REMAINS TRUE");
249 }
250
251 // the_document is where all the doc info - structure and metadata etc
252 // is added into, to be returned in the page
253 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
254 page_response.appendChild(the_document);
255
256// used to create basic_doc_list here
257 if (document_type == null)
258 {
259 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
260 }
261 if (document_type == null)
262 {
263 logger.debug("##### doctype is null, setting to simple");
264 document_type = GSXML.DOC_TYPE_SIMPLE;
265 }
266
267 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
268
269 // start getting doc structure
270
271 // Create a parameter list to specify the required structure information
272 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
273
274 if (service_params != null)
275 {
276 GSXML.addParametersToList(ds_param_list, service_params);
277 }
278
279 Element ds_param = null;
280 boolean get_structure = false;
281 boolean get_structure_info = false;
282 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
283 {
284 get_structure_info = true;
285
286 if (expand_contents)
287 {
288 ds_param = doc.createElement(GSXML.PARAM_ELEM);
289 ds_param_list.appendChild(ds_param);
290 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
291 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
292 }
293
294 // get the info needed for paged naviagtion
295 ds_param = doc.createElement(GSXML.PARAM_ELEM);
296 ds_param_list.appendChild(ds_param);
297 ds_param.setAttribute(GSXML.NAME_ATT, "info");
298 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
299 ds_param = doc.createElement(GSXML.PARAM_ELEM);
300 ds_param_list.appendChild(ds_param);
301 ds_param.setAttribute(GSXML.NAME_ATT, "info");
302 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
303 ds_param = doc.createElement(GSXML.PARAM_ELEM);
304 ds_param_list.appendChild(ds_param);
305 ds_param.setAttribute(GSXML.NAME_ATT, "info");
306 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
307
308 if (get_siblings)
309 {
310 ds_param = doc.createElement(GSXML.PARAM_ELEM);
311 ds_param_list.appendChild(ds_param);
312 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
313 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
314 }
315
316 }
317 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
318 {
319 get_structure = true;
320 if (expand_contents)
321 {
322 ds_param = doc.createElement(GSXML.PARAM_ELEM);
323 ds_param_list.appendChild(ds_param);
324 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
325 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
326 }
327 else
328 {
329 // get the info needed for table of contents
330 ds_param = doc.createElement(GSXML.PARAM_ELEM);
331 ds_param_list.appendChild(ds_param);
332 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
333 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
334 ds_param = doc.createElement(GSXML.PARAM_ELEM);
335 ds_param_list.appendChild(ds_param);
336 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
337 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
338 if (get_siblings)
339 {
340 ds_param = doc.createElement(GSXML.PARAM_ELEM);
341 ds_param_list.appendChild(ds_param);
342 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
343 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
344 }
345 }
346 }
347 else
348 {
349 // we dont need any structure
350 }
351
352 boolean has_dummy = false;
353 if (get_structure || get_structure_info)
354 {
355
356 // Build a request to obtain the document structure
357 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
358 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
359 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
360 ds_message.appendChild(ds_request);
361 ds_request.appendChild(ds_param_list);
362
363 // add the node list we created earlier
364 ds_request.appendChild(basic_doc_list);
365
366 // Process the document structure retrieve message
367 Element ds_response_message = (Element) this.mr.process(ds_message);
368 if (processErrorElements(ds_response_message, page_response))
369 {
370 return result;
371 }
372
373 // get the info and print out
374 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
375 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
376 path = GSPath.appendLink(path, "nodeStructureInfo");
377 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
378 // get the doc_node bit
379 if (ds_response_struct_info != null)
380 {
381 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
382 }
383 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
384 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
385 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
386 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
387
388 if (ds_response_structure != null)
389 {
390 // add the contents of the structure bit into the_document
391 NodeList structs = ds_response_structure.getChildNodes();
392 for (int i = 0; i < structs.getLength(); i++)
393 {
394 the_document.appendChild(doc.importNode(structs.item(i), true));
395 }
396 }
397 else
398 {
399 // no structure nodes, so put in a dummy doc node
400 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
401 if (document_id != null)
402 {
403 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
404 }
405 else
406 {
407 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
408
409 }
410 the_document.appendChild(doc_node);
411 has_dummy = true;
412 }
413 }
414 else
415 { // a simple type - we dont have a dummy node for simple
416 // should think about this more
417 // no structure request, so just put in a dummy doc node
418 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
419 if (document_id != null)
420 {
421 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
422 }
423 else
424 {
425 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
426 }
427 the_document.appendChild(doc_node);
428 has_dummy = true;
429 }
430
431 // end getting doc structure
432
433 // start getting doc metadata
434
435 // Build a request to obtain some document metadata
436 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
437 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
438 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
439 dm_message.appendChild(dm_request);
440 // Create a parameter list to specify the required metadata information
441
442 HashSet<String> meta_names = new HashSet<String>();
443 meta_names.add("Title"); // the default
444 if (format_elem != null)
445 {
446 getRequiredMetadataNames(format_elem, meta_names);
447 }
448
449 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
450 if (extraMetaListElem != null)
451 {
452 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
453 for (int i = 0; i < extraMetaList.getLength(); i++)
454 {
455 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
456 }
457 }
458
459 Element dm_param_list = createMetadataParamList(doc,meta_names);
460 if (service_params != null)
461 {
462 GSXML.addParametersToList(dm_param_list, service_params);
463 }
464
465 dm_request.appendChild(dm_param_list);
466
467 // create the doc node list for the metadata request
468 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
469 dm_request.appendChild(dm_doc_list);
470
471 // Add each node from the structure response into the metadata request
472 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
473 for (int i = 0; i < doc_nodes.getLength(); i++)
474 {
475 Element doc_node = (Element) doc_nodes.item(i);
476 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
477
478 // Add the documentNode to the list
479 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
480 if (needSectionContent(params)) {
481 if (doc_node_id.equals(document_id)) {
482 dm_doc_list.appendChild(dm_doc_node);
483 }
484 } else {
485 dm_doc_list.appendChild(dm_doc_node);
486 }
487 //dm_doc_list.appendChild(dm_doc_node);
488 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
489 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
490 if (document_id == null){
491 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
492 }
493
494 }
495 // we also want a metadata request to the top level document to get
496 // assocfilepath - this could be cached too
497 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
498 dm_message.appendChild(doc_meta_request);
499 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
500 if (service_params != null)
501 {
502 GSXML.addParametersToList(doc_meta_param_list, service_params);
503 }
504
505 doc_meta_request.appendChild(doc_meta_param_list);
506 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
507 doc_meta_param_list.appendChild(doc_param);
508 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
509 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
510
511 // create the doc node list for the metadata request
512 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
513 doc_meta_request.appendChild(doc_list);
514
515 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
516 // the node we want is the root document node
517 if (document_id != null)
518 {
519 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
520 }
521 /*else
522 {
523 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
524 // can we assume that href is always a top level doc??
525 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
526 //doc_node.setAttribute("externalURL", has_rl);
527 }*/
528 doc_list.appendChild(doc_node);
529
530 Element dm_response_message = (Element) this.mr.process(dm_message);
531 if (processErrorElements(dm_response_message, page_response))
532 {
533 return result;
534 }
535
536 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
537 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
538
539 // Merge the metadata with the structure information
540 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
541 for (int i = 0; i < doc_nodes.getLength(); i++)
542 {
543 Node dcNode;
544 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
545 if (node_idd.isEmpty()) {
546 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
547 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
548 } else {
549 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
550 }
551 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
552 }
553 // get the top level doc metadata out
554 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
555 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
556 GSXML.mergeMetadataLists(the_document, top_doc_node);
557
558 // if we are highlighting query terms, then we also get them highlighted in the metadata
559
560 HashSet<String> query_term_variants = null;
561 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
562 boolean do_highlight_query_terms = highlight_query_terms;
563 int query_terms_status = 0;
564 if (highlight_query_terms) {
565 // lets get the query term equivalents
566 query_term_variants = new HashSet<String>();
567 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
568 if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
569 do_highlight_query_terms = false; // we couldn't get the terms
570 }
571 }
572
573 // lets try marking up the metadata with search terms
574 // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
575 boolean highlight_case_insensitive = false;
576 if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
577 highlight_case_insensitive = true;
578 }
579 if (do_highlight_query_terms) {
580 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
581 }
582
583 // do we want doc text content? If not, we are done.
584 if (!get_text) {
585 // don't get text
586 return result;
587 }
588
589 // Build a request to obtain some document content
590 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
591 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
592 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
593 dc_message.appendChild(dc_request);
594
595 // Create a parameter list to specify the request parameters - empty for now
596 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
597 if (service_params != null)
598 {
599 GSXML.addParametersToList(dc_param_list, service_params);
600 }
601
602 dc_request.appendChild(dc_param_list);
603
604 // get the content
605 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
606 if (expand_document)
607 {
608 dc_request.appendChild(dm_doc_list);
609 }
610 else
611 {
612 dc_request.appendChild(basic_doc_list);
613 }
614 Element dc_response_message = (Element) this.mr.process(dc_message);
615
616 if (processErrorElements(dc_response_message, page_response))
617 {
618 return result;
619
620 }
621 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
622
623 boolean get_marked_up_doc_from_query = false;
624 if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
625 get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
626 }
627
628 if (expand_document)
629 {
630 // Merge the content with the structure information
631 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
632 for (int i = 0; i < doc_nodes.getLength(); i++)
633 {
634 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
635 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
636 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
637 if (content != null)
638 {
639 if (do_highlight_query_terms) {
640 if (get_marked_up_doc_from_query) {
641
642 Element new_content = retrieveHighlightedContent(request, node_id);
643
644 if (new_content == null) {
645 // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
646 get_marked_up_doc_from_query = false;
647 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
648 } else {
649 content= new_content;
650 }
651 } else {
652 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
653 }
654 }
655 doc_nodes.item(i).appendChild(doc.importNode(content, true));
656 }
657
658 }
659 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
660 Element dummy_node = (Element) doc_nodes.item(0);
661 the_document.removeChild(dummy_node);
662 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
663 NodeList dummy_children = dummy_node.getChildNodes();
664 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
665 {
666 // special case as we don't want more than one metadata list
667 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
668 {
669 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
670 }
671 else
672 {
673 the_document.appendChild(dummy_children.item(i));
674 }
675 }
676 }
677 }
678 else
679 {
680 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
681 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
682
683 if (dc_response_doc_content == null)
684 {
685 // no content to add
686 if (dc_response_doc.getAttribute("external").equals("true"))
687 {
688 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
689
690 the_document.setAttribute("selectedNode", href_id);
691 the_document.setAttribute("external", href_id);
692 }
693 return result;
694 }
695 if (do_highlight_query_terms)
696 {
697 dc_response_doc.removeChild(dc_response_doc_content);
698 if (get_marked_up_doc_from_query) {
699 Element new_content = retrieveHighlightedContent(request, null);
700 if (new_content == null) {
701 get_marked_up_doc_from_query = false;
702 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
703 } else {
704
705 dc_response_doc_content = new_content;
706 }
707 } else {
708 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
709 }
710 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
711 }
712
713 if (provide_annotations)
714 {
715 String service_selected = (String) params.get(ENRICH_DOC_ARG);
716 if (service_selected != null && service_selected.equals("1"))
717 {
718 // now we can modifiy the response doc if needed
719 String enrich_service = (String) params.get(GSParams.SERVICE);
720 // send a message to the service
721 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
722 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
723 enrich_message.appendChild(enrich_request);
724 // check for parameters
725 HashMap e_service_params = (HashMap) params.get("s1");
726 if (e_service_params != null)
727 {
728 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
729 GSXML.addParametersToList(enrich_pl, e_service_params);
730 enrich_request.appendChild(enrich_pl);
731 }
732 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
733 enrich_request.appendChild(e_doc_list);
734 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
735
736 Node enrich_response = this.mr.process(enrich_message);
737
738 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
739 path = GSPath.createPath(links);
740 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
741
742 }
743 } // if provide_annotations
744
745 // use the returned id rather than the sent one cos there may have
746 // been modifiers such as .pr that are removed.
747 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
748 the_document.setAttribute("selectedNode", modified_doc_id);
749 if (has_dummy)
750 {
751 // change the id if necessary and add the content
752 Element dummy_node = (Element) doc_nodes.item(0);
753
754 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
755 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
756 // hack for simple type
757 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
758 {
759 // we dont want the internal docNode, just want the content and metadata in the document
760 // rethink this!!
761 the_document.removeChild(dummy_node);
762
763 NodeList dummy_children = dummy_node.getChildNodes();
764 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
765 {
766 // special case as we don't want more than one metadata list
767 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
768 {
769 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
770 }
771 else
772 {
773 the_document.appendChild(dummy_children.item(i));
774 }
775 }
776 }
777
778 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
779 }
780 else
781 {
782 // Merge the document content with the metadata and structure information
783 for (int i = 0; i < doc_nodes.getLength(); i++)
784 {
785 Node dn = doc_nodes.item(i);
786 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
787 if (dn_id.equals(modified_doc_id))
788 {
789 dn.appendChild(doc.importNode(dc_response_doc_content, true));
790 break;
791 }
792 }
793 }
794 }
795 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
796 return result;
797 }
798
799 protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
800 Document doc = basic_doc_list.getOwnerDocument();
801
802 Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
803 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
804 Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
805 v_message.appendChild(v_request);
806
807 // add the node list
808 v_request.appendChild(basic_doc_list);
809 Element v_response_message = (Element) this.mr.process(v_message);
810 if (processErrorElements(v_response_message, page_response))
811 {
812 return false;
813 }
814 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
815 String path = GSPath.createPath(links);
816 Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
817 if (info_elem == null) {
818 return false;
819 }
820 if (info_elem.getAttribute("valid").equals("true")) {
821 return true;
822 }
823 return false;
824
825 }
826
827 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
828 // call get archive doc
829 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
830 String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
831 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
832 dx_message.appendChild(dx_request);
833 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
834 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
835 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
836 dx_request.appendChild(dx_section);
837
838 Element dx_response_message = (Element) this.mr.process(dx_message);
839 if (processErrorElements(dx_response_message, page_response))
840 {
841 return result;
842 }
843
844 // get the section out
845 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
846 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
847 if (section == null) {
848 logger.error("no archive doc returned for "+document_id);
849 return result;
850 }
851 // convert the archive format into the internal format that the page response requires
852
853 // work out doctype
854 // NOTE: this will be coming from collection database in index
855 // the archive file doesn't store this. So we have to assume
856 // that the doc type will not be changing with any
857 // modifications happening to archives.
858
859 // if doc type is null, then we need to work it out.
860 // create a basic doc list containing the current node
861
862 if (document_type == null) {
863 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
864 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
865 basic_doc_list.appendChild(current_doc);
866 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
867 basic_doc_list.appendChild(current_doc);
868 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
869 }
870
871 if (document_type == null) {
872 logger.debug("@@@ doctype is null, setting to simple");
873 document_type = GSXML.DOC_TYPE_SIMPLE;
874 }
875
876 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
877 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
878 page_response.appendChild(doc_elem);
879
880 Element transformed_section = transformArchiveToDocument(section);
881 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
882 // simple doc, only returning a single document node, which is the top level section.
883 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
884 GSXML.mergeElements(doc_elem, transformed_section);
885 return result;
886 }
887
888 // multi sectioned document.
889 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
890 // In docEdit mode, we obtain the text from archives, from doc.xml
891 // Now the transformation has replaced <Section> with <documentNode>
892 // Need to add nodeID, nodeType and docType attributes to each docNode
893 // as doc.xml doesn't store that.
894 insertDocNodeAttributes(transformed_section, document_type, null);
895 doc_elem.appendChild(doc.importNode(transformed_section, true));
896 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
897
898 return result;
899 }
900
901
902 private boolean needSectionContent(HashMap<String, Serializable> params) {
903 String document_id = (String) params.get(GSParams.DOCUMENT);
904 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
905 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
906 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
907 return true;
908 }
909
910 return false;
911 }
912 /**
913 * this method gets the collection description, the format info, the list of
914 * enrich services, etc - stuff that is needed for the page, but is the same
915 * whatever the query is - should be cached
916 */
917 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
918 {
919 Document doc = page_response.getOwnerDocument();
920
921 // create a message to process - contains requests for the collection
922 // description, the format element, the enrich services on offer
923 // these could all be cached
924 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
925 String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
926 // the format request - ignore for now, where does this request go to??
927 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
928 info_message.appendChild(format_request);
929
930 // the enrich_services request - only do this if provide_annotations is true
931
932 if (provide_annotations)
933 {
934 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
935 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
936 info_message.appendChild(enrich_services_request);
937 }
938
939 Element info_response = (Element) this.mr.process(info_message);
940
941 // the collection is the first response
942 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
943 Element format_resp = (Element) responses.item(0);
944
945 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
946 if (format_elem != null)
947 {
948 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
949 if (global_format_elem != null)
950 {
951 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
952 }
953
954 // set the format type
955 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
956 page_response.appendChild(doc.importNode(format_elem, true));
957 }
958
959 if (provide_annotations)
960 {
961 Element services_resp = (Element) responses.item(1);
962
963 // a new message for the mr
964 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
965 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
966 boolean service_found = false;
967 for (int j = 0; j < e_services.getLength(); j++)
968 {
969 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
970 {
971 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
972 enrich_message.appendChild(s);
973 service_found = true;
974 }
975 }
976 if (service_found)
977 {
978 Element enrich_response = (Element) this.mr.process(enrich_message);
979
980 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
981 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
982 for (int i = 0; i < e_responses.getLength(); i++)
983 {
984 Element e_resp = (Element) e_responses.item(i);
985 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
986 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
987 service_list.appendChild(e_service);
988 }
989 page_response.appendChild(service_list);
990 }
991 } // if provide_annotations
992 return true;
993
994 }
995
996 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
997 {
998 Document doc = basic_doc_list.getOwnerDocument();
999
1000 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
1001 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
1002 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1003 ds_message.appendChild(ds_request);
1004
1005 // Create a parameter list to specify the required structure information
1006 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1007 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
1008 ds_param_list.appendChild(ds_param);
1009 ds_param.setAttribute(GSXML.NAME_ATT, "info");
1010 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1011
1012 ds_request.appendChild(ds_param_list);
1013
1014 // add the node list we created earlier
1015 ds_request.appendChild(basic_doc_list);
1016
1017 // Process the document structure retrieve message
1018 Element ds_response_message = (Element) this.mr.process(ds_message);
1019 if (processErrorElements(ds_response_message, page_response))
1020 {
1021 return null;
1022 }
1023
1024 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1025 String path = GSPath.createPath(links);
1026 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1027 if (info_elem == null) {
1028 return null;
1029 }
1030 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1031 if (doctype_elem != null)
1032 {
1033 String doc_type = doctype_elem.getAttribute("value");
1034 return doc_type;
1035 }
1036 return null;
1037 }
1038
1039 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1040 // The docType remains constant as in parameter document_type
1041 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1042 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1043 // The nodeType is root if rootNode, internal if there are children and leaf if no children
1044 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1045
1046 boolean isRoot = false;
1047 if(id == null) { // rootNode, get the root nodeID to work with recursively
1048 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1049 isRoot = true;
1050 } else { // for all but the root node, need to still set the nodeID
1051 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1052 }
1053
1054 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1055
1056 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1057 if(docNodes.getLength() > 0) {
1058 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1059 for(int i = 0; i < docNodes.getLength(); i++) {
1060 Element childDocNode = (Element)docNodes.item(i);
1061
1062 // work out the child docNode's nodeID based on current id
1063 String nodeID = id + "." + (i+1);
1064 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1065 }
1066 } else {
1067 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1068 }
1069
1070 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1071 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1072
1073 }
1074
1075 /** run the XSLT transform which converts from doc.xml format to our internal document format */
1076 protected Element transformArchiveToDocument(Element section) {
1077
1078 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1079 if (stylesheet_filename == null) {
1080 logger.error("Couldn't find stylesheet archive2document.xsl");
1081 return section;
1082 }
1083
1084 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1085 if (stylesheet_doc == null) {
1086 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1087 return section;
1088 }
1089
1090 Document section_doc = XMLConverter.newDOM();
1091 section_doc.appendChild(section_doc.importNode(section, true));
1092 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1093 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1094
1095 Element new_element;
1096 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1097 new_element = ((Document) result).getDocumentElement();
1098 } else {
1099 new_element = (Element) result;
1100 }
1101
1102
1103 return new_element;
1104
1105 }
1106
1107 protected final int NO_QUERY_TERMS = 0;
1108 protected final int NO_EQUIV_QUERY_TERMS = 1;
1109 protected final int EQUIV_QUERY_TERMS = 2;
1110 /**
1111 * this involves a bit of a hack to get the equivalent query terms - has to
1112 * requery the query service - uses the last selected service name. (if it
1113 * ends in query).
1114 */
1115 protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1116 {
1117 Document doc = XMLConverter.newDOM();
1118
1119 // do the query again to get term info
1120 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1121 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1122
1123 HashMap previous_params = (HashMap) params.get("p");
1124 if (previous_params == null)
1125 {
1126 return NO_QUERY_TERMS;
1127 }
1128 String service_name = (String) previous_params.get(GSParams.SERVICE);
1129 if (service_name == null || !service_name.endsWith("Query"))
1130 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1131 logger.debug("invalid service "+service_name+", not doing highlighting");
1132 return NO_QUERY_TERMS;
1133 }
1134
1135 String collection = (String) params.get(GSParams.COLLECTION);
1136 UserContext userContext = new UserContext(request);
1137 String to = GSPath.appendLink(collection, service_name);
1138
1139 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1140 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1141 mr_query_message.appendChild(mr_query_request);
1142
1143 // paramList
1144 HashMap service_params = (HashMap) params.get("s1");
1145
1146 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1147 GSXML.addParametersToList(query_param_list, service_params);
1148 mr_query_request.appendChild(query_param_list);
1149
1150 // do the query
1151 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1152
1153 // find the term lists
1154 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1155 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1156 if (query_term_list_element == null)
1157 {
1158 // no term info
1159 return NO_QUERY_TERMS;
1160 }
1161
1162 int result_code = NO_EQUIV_QUERY_TERMS;
1163 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1164 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1165 {
1166 // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1167 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1168 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1169 {
1170 for (int i = 0; i < terms_nodelist.getLength(); i++)
1171 {
1172 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1173 query_term_variants.add(termValue.toLowerCase());
1174 }
1175 }
1176 }
1177 else
1178 {
1179 result_code = EQUIV_QUERY_TERMS;
1180 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1181 {
1182 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1183 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1184 for (int j = 0; j < equivalent_terms.length; j++)
1185 {
1186 query_term_variants.add(equivalent_terms[j]);
1187 }
1188 }
1189 }
1190
1191 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1192 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1193
1194 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1195 String performed_query = GSXML.getNodeText(query_element) + " ";
1196 logger.debug("performed query="+performed_query);
1197
1198 boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1199 if (performed_query.contains("\"")) {
1200 has_phrases = true;
1201 }
1202
1203 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1204 int term_start = 0;
1205 boolean in_term = false;
1206 boolean in_phrase = false;
1207 for (int i = 0; i < performed_query.length(); i++) {
1208
1209 char character = performed_query.charAt(i);
1210 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1211
1212 // Has a query term just started?
1213 if (in_term == false && is_character_letter_or_digit == true)
1214 {
1215 in_term = true;
1216 term_start = i;
1217 }
1218
1219 // Or has a term just finished?
1220 else if (in_term == true && is_character_letter_or_digit == false)
1221 {
1222 in_term = false;
1223 String term = performed_query.substring(term_start, i);
1224 if (has_phrases) {
1225 // do the phrase bit
1226 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1227 if (result_code == EQUIV_QUERY_TERMS) {
1228 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1229 if (term_element != null) {
1230 // might be null for eg TX in [snails]:TX
1231
1232 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1233 if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
1234 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1235 {
1236 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1237 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1238 for (int k = 0; k < term_equivalent_terms.length; k++)
1239 {
1240 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1241 }
1242 }
1243 }
1244 }
1245 } else { // result_code != EQUIV_QUERY_TERMS
1246 // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1247 if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
1248 // this handles the case where the user has searched for snails, but term list returns 'snail'
1249 phrase_query_p_term_x_variants.add(term.toLowerCase());
1250 }
1251 }
1252 if (phrase_query_p_term_x_variants.size()>0) {
1253 // we have found a valid term
1254 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1255
1256 if (in_phrase == false)
1257 {
1258 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1259 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1260 }
1261 }
1262 } // end if has_phrases
1263 else {
1264 // no phrases so we don't have to do the phrasey stuff. but
1265 // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1266 // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1267 // but dont want to include eg TX
1268 if (result_code == NO_EQUIV_QUERY_TERMS) {
1269 if (containsSubString(query_term_variants, term)) {
1270 query_term_variants.add(term.toLowerCase());
1271 }
1272 }
1273
1274 }
1275 } // end of in_term...
1276 // Watch for phrases (surrounded by quotes)
1277 if (character == '\"') {
1278
1279 // Has a phrase just started?
1280 if (in_phrase == false)
1281 {
1282 in_phrase = true;
1283 }
1284 // Or has a phrase just finished?
1285 else if (in_phrase == true)
1286 {
1287 in_phrase = false;
1288 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1289 }
1290
1291 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1292 } // if char == "
1293 } // for each char in performed query
1294
1295 return result_code;
1296 }
1297
1298 protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1299 // hack to filter out TX, TI field names
1300 String lc_term = term.toLowerCase();
1301 if (query_term_variants.contains(term)) {
1302 return false; // or true??
1303 }
1304 if (term.matches("[A-Z][A-Z][A-Z]?")) {
1305 return false;
1306 }
1307 Iterator i = query_term_variants.iterator();
1308 while (i.hasNext()) {
1309 String t = (String)i.next();
1310 if (term.startsWith(t)) {
1311 return true;
1312 }
1313 }
1314 return false;
1315 }
1316
1317
1318 /** retrieve the marked up highlighted section - only works for solr collection */
1319 protected Element retrieveHighlightedContent(Element request, String node_id) {
1320
1321 Document doc = XMLConverter.newDOM();
1322
1323 // do the query again to get term info
1324 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1325 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1326
1327 HashMap previous_params = (HashMap) params.get("p");
1328 if (previous_params == null)
1329 {
1330 return null;
1331 }
1332 String service_name = (String) previous_params.get(GSParams.SERVICE);
1333 if (service_name == null || !service_name.endsWith("Query"))
1334 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1335 logger.debug("HL invalid service, not doing highlighting");
1336 return null;
1337 }
1338
1339 String collection = (String) params.get(GSParams.COLLECTION);
1340 UserContext userContext = new UserContext(request);
1341 String to = GSPath.appendLink(collection, service_name);
1342
1343 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1344 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1345 mr_query_message.appendChild(mr_query_request);
1346
1347 // paramList
1348 HashMap service_params = (HashMap) params.get("s1");
1349
1350 // hack in case the user searched on eg titles, but we want highlighting in the text
1351 service_params.put("index", "TX");
1352 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1353 GSXML.addParametersToList(query_param_list, service_params);
1354
1355 if (node_id != null) {
1356 GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1357 } else {
1358 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1359 }
1360 mr_query_request.appendChild(query_param_list);
1361 // do the query
1362
1363 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1364 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1365 Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1366
1367 if (highlighted_node == null) {
1368 return null;
1369 }
1370 // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1371 //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1372
1373 // Build a request to process highlighted text
1374
1375 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1376 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1377 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1378 hl_message.appendChild(dc_request);
1379
1380 // Create a parameter list to specify the request parameters - empty for now
1381 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1382 dc_request.appendChild(dc_param_list);
1383
1384 // get the content
1385 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1386 dc_request.appendChild(doc_list);
1387 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1388 doc_list.appendChild(current_doc);
1389 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1390 //Append highlighted content to request for processing
1391 dc_request.appendChild(doc.importNode(highlighted_node, true));
1392 Element hl_response_message = (Element) this.mr.process(hl_message);
1393 //Get results
1394 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1395 Element content = (Element) contentList.item(0);
1396 return content;
1397
1398
1399 }
1400 /**
1401 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1402 */
1403 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1404
1405 NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1406 for (int j=named_elems.getLength()-1; j>=0; j--) {
1407 Element this_elem = (Element)named_elems.item(j);
1408 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1409 this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1410 }
1411 return true;
1412 }
1413 /**
1414 * Highlights query terms in the text content of an element.
1415 */
1416 private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1417 {
1418 String content = GSXML.getNodeText(original_element);
1419 // Convert the content string to an array of characters for speed
1420 char[] content_characters = new char[content.length()];
1421 content.getChars(0, content.length(), content_characters, 0);
1422
1423 // Now skim through the content, identifying word matches
1424 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1425 int word_start = 0;
1426 boolean in_word = false;
1427 boolean preceding_word_matched = false;
1428 boolean inTag = false;
1429 for (int i = 0; i < content_characters.length; i++)
1430 {
1431 //We don't want to find words inside HTML tags
1432 if (content_characters[i] == '<')
1433 {
1434 // are we currently in a word?
1435 if (in_word) {
1436 in_word = false;
1437 String word = new String(content_characters, word_start, (i - word_start));
1438 if (case_insensitive) {
1439 word = word.toLowerCase();
1440 }
1441 if (query_term_variants.contains(word)) {
1442 // We have found a matching word, so remember its location
1443 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1444 // should preceding word matched be set to true/false here??
1445 preceding_word_matched = true;
1446 } else {
1447 preceding_word_matched = false;
1448 }
1449 }
1450 inTag = true;
1451 continue;
1452 }
1453 else if (inTag && content_characters[i] == '>')
1454 {
1455 inTag = false;
1456 continue;
1457 }
1458 else if (inTag)
1459 {
1460 continue;
1461 }
1462
1463 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1464
1465 // Has a word just started?
1466 if (in_word == false && is_character_letter_or_digit == true)
1467 {
1468 in_word = true;
1469 word_start = i;
1470 }
1471
1472 // Or has a word just finished?
1473 else if (in_word == true && is_character_letter_or_digit == false)
1474 {
1475 in_word = false;
1476
1477 // Check if the word matches any of the query term equivalents
1478 String word = new String(content_characters, word_start, (i - word_start));
1479 if (case_insensitive) {
1480 word = word.toLowerCase();
1481 }
1482 if (query_term_variants.contains(word))
1483 {
1484 // We have found a matching word, so remember its location
1485 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1486 preceding_word_matched = true;
1487 }
1488 else
1489 {
1490 preceding_word_matched = false;
1491 }
1492 }
1493 }
1494
1495 // Don't forget the last word...
1496 if (in_word == true)
1497 {
1498 // Check if the word matches any of the query term equivalents
1499 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1500 if (case_insensitive) {
1501 word = word.toLowerCase();
1502 }
1503 if (query_term_variants.contains(word))
1504 {
1505 // We have found a matching word, so remember its location
1506 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1507 }
1508 }
1509
1510 if (word_matches.size() == 0) {
1511 // just return a copy of the original element
1512 return (Element)doc.importNode(original_element, true);
1513
1514 }
1515
1516 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1517 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1518
1519 if (phrase_query_term_variants_hierarchy.size() ==0) {
1520 for (int i = 0; i < word_matches.size(); i++) {
1521 highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1522 highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1523 }
1524 }
1525 else {
1526 // Deal with phrases now
1527 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1528 for (int i = 0; i < word_matches.size(); i++)
1529 {
1530 WordMatch word_match = word_matches.get(i);
1531
1532 // See if any partial phrase matches are extended by this word
1533 if (word_match.preceding_word_matched)
1534 {
1535 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1536 {
1537 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1538 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1539 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1540 if (phrase_query_p_term_x_variants.contains(word_match.word))
1541 {
1542 partial_phrase_match.num_words_matched++;
1543
1544 // Has a complete phrase match occurred?
1545 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1546 {
1547 // Check for overlaps by looking at the previous highlight range
1548 if (!highlight_end_positions.isEmpty())
1549 {
1550 int last_highlight_index = highlight_end_positions.size() - 1;
1551 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1552 if (last_highlight_end > partial_phrase_match.start_position)
1553 {
1554 // There is an overlap, so remove the previous phrase match
1555 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1556 highlight_end_positions.remove(last_highlight_index);
1557 partial_phrase_match.start_position = last_highlight_start;
1558 }
1559 }
1560
1561 highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1562 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1563 }
1564 // No, but add the partial match back into the list for next time
1565 else
1566 {
1567 partial_phrase_matches.add(partial_phrase_match);
1568 }
1569 }
1570 }
1571 }
1572 else
1573 {
1574 partial_phrase_matches.clear();
1575 }
1576
1577 // See if this word is at the start of any of the phrases
1578 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1579 {
1580 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1581 if (phrase_query_p_term_variants_list.size()>0) {
1582 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1583 if (phrase_query_p_term_1_variants.contains(word_match.word))
1584 {
1585 // If this phrase is just one word long, we have a complete match
1586 if (phrase_query_p_term_variants_list.size() == 1)
1587 {
1588 highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1589 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1590 }
1591 // Otherwise we have the start of a potential phrase match
1592 else
1593 {
1594 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1595 }
1596 }
1597 }
1598 }
1599 }
1600 }
1601
1602 // Now add the annotation tags into the document at the correct points
1603 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1604 int last_wrote = 0;
1605 for (int i = 0; i < highlight_start_positions.size(); i++)
1606 {
1607 int highlight_start = highlight_start_positions.get(i).intValue();
1608 int highlight_end = highlight_end_positions.get(i).intValue();
1609
1610 // Print anything before the highlight range
1611 if (last_wrote < highlight_start)
1612 {
1613 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1614 content_element.appendChild(doc.createTextNode(preceding_text));
1615 }
1616
1617 // Print the highlight text, annotated
1618 if (highlight_end > last_wrote)
1619 {
1620 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1621 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1622 annotation_element.setAttribute("type", "query_term");
1623 content_element.appendChild(annotation_element);
1624 last_wrote = highlight_end;
1625 }
1626 }
1627
1628 // Finish off any unwritten text
1629 if (last_wrote < content_characters.length)
1630 {
1631 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1632 content_element.appendChild(doc.createTextNode(remaining_text));
1633 }
1634 return content_element;
1635 }
1636
1637
1638 static private class WordMatch
1639 {
1640 public String word;
1641 public int start_position;
1642 public int end_position;
1643 public boolean preceding_word_matched;
1644
1645 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1646 {
1647 this.word = word;
1648 this.start_position = start_position;
1649 this.end_position = end_position;
1650 this.preceding_word_matched = preceding_word_matched;
1651 }
1652 }
1653
1654 static private class PartialPhraseMatch
1655 {
1656 public int start_position;
1657 public int query_phrase_number;
1658 public int num_words_matched;
1659
1660 public PartialPhraseMatch(int start_position, int query_phrase_number)
1661 {
1662 this.start_position = start_position;
1663 this.query_phrase_number = query_phrase_number;
1664 this.num_words_matched = 1;
1665 }
1666 }
1667}
Note: See TracBrowser for help on using the repository browser.