source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 37515

Last change on this file since 37515 was 37515, checked in by kjdon, 14 months ago

Using the new getRequiredMetadataNames - it has an extra arg, and we no longer need to do the extraMetadataList bit ourselves, as it's now in getRequiredMetadataNames

  • Property svn:keywords set to Author Date Id Revision
File size: 63.6 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
24import org.greenstone.gsdl3.service.DocXMLUtil;
25import org.greenstone.gsdl3.util.*;
26import org.greenstone.util.GlobalProperties;
27
28// XML classes
29import org.w3c.dom.Document;
30import org.w3c.dom.Element;
31import org.w3c.dom.Node;
32import org.w3c.dom.Text;
33import org.w3c.dom.NodeList;
34
35// General Java classes
36import java.util.ArrayList;
37import java.util.HashMap;
38import java.util.HashSet;
39import java.util.Iterator;
40import java.io.File;
41import java.io.Serializable;
42
43import org.apache.log4j.*;
44
45/** Action class for retrieving Documents via the message router */
46public class DocumentAction extends Action
47{
48
49 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
50
51 // this is used to specify that the sibling nodes of a selected one should be obtained
52 public static final String SIBLING_ARG = "sib";
53 public static final String GOTO_PAGE_ARG = "gp";
54 public static final String ENRICH_DOC_ARG = "end";
55 public static final String EXPAND_DOCUMENT_ARG = "ed";
56 public static final String EXPAND_CONTENTS_ARG = "ec";
57 public static final String REALISTIC_BOOK_ARG = "book";
58 public static final String NO_TEXT_ARG = "noText";
59 public static final String DOC_EDIT_ARG = "docEdit";
60 public static final String DOC_VERSION_ARG = "dv";
61
62 /**
63 * if this is set to true, when a document is displayed, any annotation type
64 * services (enrich) will be offered to the user as well
65 */
66 protected boolean provide_annotations = false;
67
68 protected boolean highlight_query_terms = false;
69
70 public boolean configure()
71 {
72 super.configure();
73 String highlight = (String) config_params.get("highlightQueryTerms");
74 if (highlight != null && highlight.equals("true"))
75 {
76 highlight_query_terms = true;
77 }
78 String annotate = (String) config_params.get("displayAnnotationService");
79 if (annotate != null && annotate.equals("true"))
80 {
81 provide_annotations = true;
82 }
83 return true;
84 }
85
86 public Node process(Node message_node)
87 {
88 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
89
90 Element message = GSXML.nodeToElement(message_node);
91 Document doc = XMLConverter.newDOM();
92
93 // the response
94 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
95 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
96 result.appendChild(page_response);
97
98 // get the request - assume only one
99 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
100 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
101 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
102
103 // just in case there are some that need to get passed to the services
104 // why do we use s0 here and s1 in other places???
105 HashMap service_params = (HashMap) params.get("s0");
106
107 String collection = (String) params.get(GSParams.COLLECTION);
108 String document_id = (String) params.get(GSParams.DOCUMENT);
109 if (document_id != null && document_id.equals(""))
110 {
111 document_id = null;
112 }
113 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
114 if (href != null && href.equals(""))
115 {
116 href = null;
117 }
118 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
119 if (document_id == null && href == null)
120 {
121 logger.error("no document specified!");
122 return result;
123 }
124 if (rl != null && rl.equals("0"))
125 {
126 // this is a true external link, we should have been directed to a different page or action
127 logger.error("rl value was 0, shouldn't get here");
128 return result;
129 }
130
131 String doc_id_modifier = "";
132 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
133 if (sibling_num != null && !sibling_num.equals(""))
134 {
135 // we have to modify the doc name
136 doc_id_modifier = "." + sibling_num + ".ss";
137 }
138
139
140 UserContext userContext = new UserContext(request);
141
142 //append site metadata
143 addSiteMetadata(page_response, userContext);
144 addInterfaceOptions(page_response);
145
146 // get the additional data needed for the page
147 getBackgroundData(page_response, collection, userContext);
148
149 // create a basic doc list containing the current node
150 // we will use this to query whether the id is valid, and to get document type
151 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
152 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
153 basic_doc_list.appendChild(current_doc);
154 if (document_id != null)
155 {
156 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
157 }
158 else
159 {
160 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
161 // do we need this??
162 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
163 }
164
165 // lets do a quick check here for valid doc id.
166 if (document_id != null) {
167 boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
168 if (!is_valid) {
169 GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
170 return result;
171 }
172 }
173 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
174
175 if (format_elem != null) {
176 // lets look for param defaults set in config file
177 NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
178 for (int i=0; i<param_defaults.getLength(); i++) {
179 Element p = (Element)param_defaults.item(i);
180 String name = p.getAttribute(GSXML.NAME_ATT);
181 if (params.get(name) ==null) {
182 // wasn't set from interface
183 String value = p.getAttribute(GSXML.VALUE_ATT);
184 params.put(name, value );
185 // also add into request param xml so that xslt knows it too
186 GSXML.addParameterToList(cgi_paramList, name, value);
187 }
188 }
189 }
190
191 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
192 if (document_type != null && document_type.equals(""))
193 {
194 //document_type = "hierarchy";
195 document_type = null; // we'll get it later if not already specified
196 }
197 // what if it is null here?? Anu to check...
198
199
200 boolean editing_document = false;
201 String doc_edit = (String) params.get(DOC_EDIT_ARG);
202 if (doc_edit != null && doc_edit.equals("1")) {
203 editing_document = true;
204 }
205
206 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
207 if (editing_document) {
208 String opt_document_version = (String) params.get(DOC_VERSION_ARG);
209 return getFormattedArchiveDoc(doc, collection, document_id, opt_document_version, document_type, result, page_response, userContext);
210 }
211
212 //whether to retrieve siblings or not
213 boolean get_siblings = false;
214 String sibs = (String) params.get(SIBLING_ARG);
215 if (sibs != null && sibs.equals("1"))
216 {
217 get_siblings = true;
218 }
219
220 boolean expand_document = false;
221 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
222 if (ed_arg != null && ed_arg.equals("1"))
223 {
224 expand_document = true;
225 }
226
227 boolean expand_contents = false;
228 if (expand_document)
229 { // we always expand the contents with the text
230 expand_contents = true;
231 }
232 else
233 {
234 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
235 if (ec_arg != null && ec_arg.equals("1"))
236 {
237 expand_contents = true;
238 }
239 }
240
241 // do we want text content? Not if no_text=1.
242 // expand_document overrides this. - should it??
243 boolean get_text = true;
244 String nt_arg = (String) params.get(NO_TEXT_ARG);
245
246 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
247 logger.debug("SETTING GET TEXT TO FALSE");
248 get_text = false;
249 } else {
250 logger.debug("GET TEXT REMAINS TRUE");
251 }
252
253 // the_document is where all the doc info - structure and metadata etc
254 // is added into, to be returned in the page
255 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
256 page_response.appendChild(the_document);
257
258// used to create basic_doc_list here
259 if (document_type == null)
260 {
261 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
262 }
263 if (document_type == null)
264 {
265 logger.debug("##### doctype is null, setting to simple");
266 document_type = GSXML.DOC_TYPE_SIMPLE;
267 }
268
269 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
270
271 // start getting doc structure
272
273 // Create a parameter list to specify the required structure information
274 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
275
276 if (service_params != null)
277 {
278 GSXML.addParametersToList(ds_param_list, service_params);
279 }
280
281 Element ds_param = null;
282 boolean get_structure = false;
283 boolean get_structure_info = false;
284 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
285 {
286 get_structure_info = true;
287
288 if (expand_contents)
289 {
290 ds_param = doc.createElement(GSXML.PARAM_ELEM);
291 ds_param_list.appendChild(ds_param);
292 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
293 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
294 }
295
296 // get the info needed for paged naviagtion
297 ds_param = doc.createElement(GSXML.PARAM_ELEM);
298 ds_param_list.appendChild(ds_param);
299 ds_param.setAttribute(GSXML.NAME_ATT, "info");
300 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
301 ds_param = doc.createElement(GSXML.PARAM_ELEM);
302 ds_param_list.appendChild(ds_param);
303 ds_param.setAttribute(GSXML.NAME_ATT, "info");
304 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
305 ds_param = doc.createElement(GSXML.PARAM_ELEM);
306 ds_param_list.appendChild(ds_param);
307 ds_param.setAttribute(GSXML.NAME_ATT, "info");
308 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
309
310 if (get_siblings)
311 {
312 ds_param = doc.createElement(GSXML.PARAM_ELEM);
313 ds_param_list.appendChild(ds_param);
314 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
315 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
316 }
317
318 }
319 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
320 {
321 get_structure = true;
322 if (expand_contents)
323 {
324 ds_param = doc.createElement(GSXML.PARAM_ELEM);
325 ds_param_list.appendChild(ds_param);
326 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
327 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
328 }
329 else
330 {
331 // get the info needed for table of contents
332 ds_param = doc.createElement(GSXML.PARAM_ELEM);
333 ds_param_list.appendChild(ds_param);
334 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
335 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
336 ds_param = doc.createElement(GSXML.PARAM_ELEM);
337 ds_param_list.appendChild(ds_param);
338 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
339 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
340 if (get_siblings)
341 {
342 ds_param = doc.createElement(GSXML.PARAM_ELEM);
343 ds_param_list.appendChild(ds_param);
344 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
345 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
346 }
347 }
348 }
349 else
350 {
351 // we dont need any structure
352 }
353
354 boolean has_dummy = false;
355 if (get_structure || get_structure_info)
356 {
357
358 // Build a request to obtain the document structure
359 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
360 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
361 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
362 ds_message.appendChild(ds_request);
363 ds_request.appendChild(ds_param_list);
364
365 // add the node list we created earlier
366 ds_request.appendChild(basic_doc_list);
367
368 // Process the document structure retrieve message
369 Element ds_response_message = (Element) this.mr.process(ds_message);
370 if (processErrorElements(ds_response_message, page_response))
371 {
372 return result;
373 }
374
375 // get the info and print out
376 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
377 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
378 path = GSPath.appendLink(path, "nodeStructureInfo");
379 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
380 // get the doc_node bit
381 if (ds_response_struct_info != null)
382 {
383 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
384 }
385 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
386 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
387 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
388 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
389
390 if (ds_response_structure != null)
391 {
392 // add the contents of the structure bit into the_document
393 NodeList structs = ds_response_structure.getChildNodes();
394 for (int i = 0; i < structs.getLength(); i++)
395 {
396 the_document.appendChild(doc.importNode(structs.item(i), true));
397 }
398 }
399 else
400 {
401 // no structure nodes, so put in a dummy doc node
402 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
403 if (document_id != null)
404 {
405 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
406 }
407 else
408 {
409 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
410
411 }
412 the_document.appendChild(doc_node);
413 has_dummy = true;
414 }
415 }
416 else
417 { // a simple type - we dont have a dummy node for simple
418 // should think about this more
419 // no structure request, so just put in a dummy doc node
420 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
421 if (document_id != null)
422 {
423 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
424 }
425 else
426 {
427 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
428 }
429 the_document.appendChild(doc_node);
430 has_dummy = true;
431 }
432
433 // end getting doc structure
434
435 // start getting doc metadata
436
437 // Build a request to obtain some document metadata
438 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
439 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
440 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
441 dm_message.appendChild(dm_request);
442 // Create a parameter list to specify the required metadata information
443
444 HashSet<String> meta_names = new HashSet<String>();
445 meta_names.add("Title"); // the default
446 getRequiredMetadataNames(meta_names, format_elem, request);
447
448 Element dm_param_list = createMetadataParamList(doc,meta_names);
449 if (service_params != null)
450 {
451 GSXML.addParametersToList(dm_param_list, service_params);
452 }
453
454 dm_request.appendChild(dm_param_list);
455
456 // create the doc node list for the metadata request
457 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
458 dm_request.appendChild(dm_doc_list);
459
460 // Add each node from the structure response into the metadata request
461 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
462 for (int i = 0; i < doc_nodes.getLength(); i++)
463 {
464 Element doc_node = (Element) doc_nodes.item(i);
465 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
466
467 // Add the documentNode to the list
468 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
469 if (needSectionContent(params)) {
470 if (doc_node_id.equals(document_id)) {
471 dm_doc_list.appendChild(dm_doc_node);
472 }
473 } else {
474 dm_doc_list.appendChild(dm_doc_node);
475 }
476 //dm_doc_list.appendChild(dm_doc_node);
477 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
478 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
479 if (document_id == null){
480 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
481 }
482
483 }
484 // we also want a metadata request to the top level document to get
485 // assocfilepath - this could be cached too
486 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
487 dm_message.appendChild(doc_meta_request);
488 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
489 if (service_params != null)
490 {
491 GSXML.addParametersToList(doc_meta_param_list, service_params);
492 }
493
494 doc_meta_request.appendChild(doc_meta_param_list);
495 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
496 doc_meta_param_list.appendChild(doc_param);
497 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
498 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
499
500 // create the doc node list for the metadata request
501 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
502 doc_meta_request.appendChild(doc_list);
503
504 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
505 // the node we want is the root document node
506 if (document_id != null)
507 {
508 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
509 }
510 /*else
511 {
512 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
513 // can we assume that href is always a top level doc??
514 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
515 //doc_node.setAttribute("externalURL", has_rl);
516 }*/
517 doc_list.appendChild(doc_node);
518
519 Element dm_response_message = (Element) this.mr.process(dm_message);
520 if (processErrorElements(dm_response_message, page_response))
521 {
522 return result;
523 }
524
525 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
526 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
527
528 // Merge the metadata with the structure information
529 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
530 for (int i = 0; i < doc_nodes.getLength(); i++)
531 {
532 Node dcNode;
533 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
534 if (node_idd.isEmpty()) {
535 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
536 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
537 } else {
538 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
539 }
540 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
541 }
542 // get the top level doc metadata out
543 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
544 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
545 GSXML.mergeMetadataLists(the_document, top_doc_node);
546
547 // if we are highlighting query terms, then we also get them highlighted in the metadata
548
549 HashSet<String> query_term_variants = null;
550 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
551 boolean do_highlight_query_terms = highlight_query_terms;
552 int query_terms_status = 0;
553 if (highlight_query_terms) {
554 // lets get the query term equivalents
555 query_term_variants = new HashSet<String>();
556 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
557 if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
558 do_highlight_query_terms = false; // we couldn't get the terms
559 }
560 }
561
562 // lets try marking up the metadata with search terms
563 // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
564 boolean highlight_case_insensitive = false;
565 if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
566 highlight_case_insensitive = true;
567 }
568 if (do_highlight_query_terms) {
569 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
570 }
571
572 // do we want doc text content? If not, we are done.
573 if (!get_text) {
574 // don't get text
575 return result;
576 }
577
578 // Build a request to obtain some document content
579 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
580 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
581 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
582 dc_message.appendChild(dc_request);
583
584 // Create a parameter list to specify the request parameters - empty for now
585 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
586 if (service_params != null)
587 {
588 GSXML.addParametersToList(dc_param_list, service_params);
589 }
590
591 dc_request.appendChild(dc_param_list);
592
593 // get the content
594 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
595 if (expand_document)
596 {
597 dc_request.appendChild(dm_doc_list);
598 }
599 else
600 {
601 dc_request.appendChild(basic_doc_list);
602 }
603 Element dc_response_message = (Element) this.mr.process(dc_message);
604
605 if (processErrorElements(dc_response_message, page_response))
606 {
607 return result;
608
609 }
610 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
611
612 boolean get_marked_up_doc_from_query = false;
613 if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
614 get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
615 }
616
617 if (expand_document)
618 {
619 // Merge the content with the structure information
620 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
621 for (int i = 0; i < doc_nodes.getLength(); i++)
622 {
623 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
624 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
625 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
626 if (content != null)
627 {
628 if (do_highlight_query_terms) {
629 if (get_marked_up_doc_from_query) {
630
631 Element new_content = retrieveHighlightedContent(request, node_id);
632
633 if (new_content == null) {
634 // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
635 get_marked_up_doc_from_query = false;
636 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
637 } else {
638 content= new_content;
639 }
640 } else {
641 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
642 }
643 }
644 doc_nodes.item(i).appendChild(doc.importNode(content, true));
645 }
646
647 }
648 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
649 Element dummy_node = (Element) doc_nodes.item(0);
650 the_document.removeChild(dummy_node);
651 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
652 NodeList dummy_children = dummy_node.getChildNodes();
653 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
654 {
655 // special case as we don't want more than one metadata list
656 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
657 {
658 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
659 }
660 else
661 {
662 the_document.appendChild(dummy_children.item(i));
663 }
664 }
665 }
666 }
667 else
668 {
669 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
670 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
671
672 if (dc_response_doc_content == null)
673 {
674 // no content to add
675 if (dc_response_doc.getAttribute("external").equals("true"))
676 {
677 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
678
679 the_document.setAttribute("selectedNode", href_id);
680 the_document.setAttribute("external", href_id);
681 }
682 return result;
683 }
684 if (do_highlight_query_terms)
685 {
686 dc_response_doc.removeChild(dc_response_doc_content);
687 if (get_marked_up_doc_from_query) {
688 Element new_content = retrieveHighlightedContent(request, null);
689 if (new_content == null) {
690 get_marked_up_doc_from_query = false;
691 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
692 } else {
693
694 dc_response_doc_content = new_content;
695 }
696 } else {
697 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
698 }
699 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
700 }
701
702 if (provide_annotations)
703 {
704 String service_selected = (String) params.get(ENRICH_DOC_ARG);
705 if (service_selected != null && service_selected.equals("1"))
706 {
707 // now we can modifiy the response doc if needed
708 String enrich_service = (String) params.get(GSParams.SERVICE);
709 // send a message to the service
710 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
711 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
712 enrich_message.appendChild(enrich_request);
713 // check for parameters
714 HashMap e_service_params = (HashMap) params.get("s1");
715 if (e_service_params != null)
716 {
717 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
718 GSXML.addParametersToList(enrich_pl, e_service_params);
719 enrich_request.appendChild(enrich_pl);
720 }
721 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
722 enrich_request.appendChild(e_doc_list);
723 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
724
725 Node enrich_response = this.mr.process(enrich_message);
726
727 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
728 path = GSPath.createPath(links);
729 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
730
731 }
732 } // if provide_annotations
733
734 // use the returned id rather than the sent one cos there may have
735 // been modifiers such as .pr that are removed.
736 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
737 the_document.setAttribute("selectedNode", modified_doc_id);
738 if (has_dummy)
739 {
740 // change the id if necessary and add the content
741 Element dummy_node = (Element) doc_nodes.item(0);
742
743 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
744 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
745 // hack for simple type
746 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
747 {
748 // we dont want the internal docNode, just want the content and metadata in the document
749 // rethink this!!
750 the_document.removeChild(dummy_node);
751
752 NodeList dummy_children = dummy_node.getChildNodes();
753 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
754 {
755 // special case as we don't want more than one metadata list
756 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
757 {
758 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
759 }
760 else
761 {
762 the_document.appendChild(dummy_children.item(i));
763 }
764 }
765 }
766
767 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
768 }
769 else
770 {
771 // Merge the document content with the metadata and structure information
772 for (int i = 0; i < doc_nodes.getLength(); i++)
773 {
774 Node dn = doc_nodes.item(i);
775 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
776 if (dn_id.equals(modified_doc_id))
777 {
778 dn.appendChild(doc.importNode(dc_response_doc_content, true));
779 break;
780 }
781 }
782 }
783 }
784 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
785 return result;
786 }
787
788 protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
789 Document doc = basic_doc_list.getOwnerDocument();
790
791 Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
792 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
793 Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
794 v_message.appendChild(v_request);
795
796 // add the node list
797 v_request.appendChild(basic_doc_list);
798 Element v_response_message = (Element) this.mr.process(v_message);
799 if (processErrorElements(v_response_message, page_response))
800 {
801 return false;
802 }
803 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
804 String path = GSPath.createPath(links);
805 Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
806 if (info_elem == null) {
807 return false;
808 }
809 if (info_elem.getAttribute("valid").equals("true")) {
810 return true;
811 }
812 return false;
813
814 }
815
816 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String opt_document_version, String document_type,
817 Element result, Element page_response, UserContext userContext ) {
818 // call get archive doc
819 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
820 String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
821 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
822 dx_message.appendChild(dx_request);
823 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
824 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
825 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
826 dx_section.setAttribute(GSXML.DOC_VERSION_ATT, opt_document_version);
827 dx_request.appendChild(dx_section);
828
829 Element dx_response_message = (Element) this.mr.process(dx_message);
830 if (processErrorElements(dx_response_message, page_response))
831 {
832 return result;
833 }
834
835 // get the section out
836 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
837 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
838 if (section == null) {
839 logger.error("no archive doc returned for "+document_id);
840 return result;
841 }
842 // convert the archive format into the internal format that the page response requires
843
844 // work out doctype
845 // NOTE: this will be coming from collection database in index
846 // the archive file doesn't store this. So we have to assume
847 // that the doc type will not be changing with any
848 // modifications happening to archives.
849
850 // if doc type is null, then we need to work it out.
851 // create a basic doc list containing the current node
852
853 if (document_type == null) {
854 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
855 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
856 basic_doc_list.appendChild(current_doc);
857 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
858 basic_doc_list.appendChild(current_doc);
859 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
860 }
861
862 if (document_type == null) {
863 logger.debug("@@@ doctype is null, setting to simple");
864 document_type = GSXML.DOC_TYPE_SIMPLE;
865 }
866
867 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
868 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
869 page_response.appendChild(doc_elem);
870
871 Element transformed_section = transformArchiveToDocument(section);
872 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
873 // simple doc, only returning a single document node, which is the top level section.
874 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
875 GSXML.mergeElements(doc_elem, transformed_section);
876 return result;
877 }
878
879 // multi sectioned document.
880 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
881 // In docEdit mode, we obtain the text from archives, from doc.xml
882 // Now the transformation has replaced <Section> with <documentNode>
883 // Need to add nodeID, nodeType and docType attributes to each docNode
884 // as doc.xml doesn't store that.
885 insertDocNodeAttributes(transformed_section, document_type, null);
886 doc_elem.appendChild(doc.importNode(transformed_section, true));
887 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
888
889 return result;
890 }
891
892
893 private boolean needSectionContent(HashMap<String, Serializable> params) {
894 String document_id = (String) params.get(GSParams.DOCUMENT);
895 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
896 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
897 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
898 return true;
899 }
900
901 return false;
902 }
903 /**
904 * this method gets the collection description, the format info, the list of
905 * enrich services, etc - stuff that is needed for the page, but is the same
906 * whatever the query is - should be cached
907 */
908 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
909 {
910 Document doc = page_response.getOwnerDocument();
911
912 // create a message to process - contains requests for the collection
913 // description, the format element, the enrich services on offer
914 // these could all be cached
915 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
916 String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
917 // the format request - ignore for now, where does this request go to??
918 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
919 info_message.appendChild(format_request);
920
921 // the enrich_services request - only do this if provide_annotations is true
922
923 if (provide_annotations)
924 {
925 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
926 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
927 info_message.appendChild(enrich_services_request);
928 }
929
930 Element info_response = (Element) this.mr.process(info_message);
931
932 // the collection is the first response
933 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
934 Element format_resp = (Element) responses.item(0);
935
936 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
937 if (format_elem != null)
938 {
939 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
940 if (global_format_elem != null)
941 {
942 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
943 }
944
945 // set the format type
946 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
947 page_response.appendChild(doc.importNode(format_elem, true));
948 }
949
950 if (provide_annotations)
951 {
952 Element services_resp = (Element) responses.item(1);
953
954 // a new message for the mr
955 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
956 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
957 boolean service_found = false;
958 for (int j = 0; j < e_services.getLength(); j++)
959 {
960 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
961 {
962 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
963 enrich_message.appendChild(s);
964 service_found = true;
965 }
966 }
967 if (service_found)
968 {
969 Element enrich_response = (Element) this.mr.process(enrich_message);
970
971 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
972 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
973 for (int i = 0; i < e_responses.getLength(); i++)
974 {
975 Element e_resp = (Element) e_responses.item(i);
976 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
977 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
978 service_list.appendChild(e_service);
979 }
980 page_response.appendChild(service_list);
981 }
982 } // if provide_annotations
983 return true;
984
985 }
986
987 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
988 {
989 Document doc = basic_doc_list.getOwnerDocument();
990
991 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
992 String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
993 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
994 ds_message.appendChild(ds_request);
995
996 // Create a parameter list to specify the required structure information
997 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
998 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
999 ds_param_list.appendChild(ds_param);
1000 ds_param.setAttribute(GSXML.NAME_ATT, "info");
1001 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
1002
1003 ds_request.appendChild(ds_param_list);
1004
1005 // add the node list we created earlier
1006 ds_request.appendChild(basic_doc_list);
1007
1008 // Process the document structure retrieve message
1009 Element ds_response_message = (Element) this.mr.process(ds_message);
1010 if (processErrorElements(ds_response_message, page_response))
1011 {
1012 return null;
1013 }
1014
1015 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
1016 String path = GSPath.createPath(links);
1017 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
1018 if (info_elem == null) {
1019 return null;
1020 }
1021 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
1022 if (doctype_elem != null)
1023 {
1024 String doc_type = doctype_elem.getAttribute("value");
1025 return doc_type;
1026 }
1027 return null;
1028 }
1029
1030 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
1031 // The docType remains constant as in parameter document_type
1032 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1033 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1034 // The nodeType is root if rootNode, internal if there are children and leaf if no children
1035 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1036
1037 boolean isRoot = false;
1038 if(id == null) { // rootNode, get the root nodeID to work with recursively
1039 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1040 isRoot = true;
1041 } else { // for all but the root node, need to still set the nodeID
1042 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1043 }
1044
1045 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1046
1047 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1048 if(docNodes.getLength() > 0) {
1049 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1050 for(int i = 0; i < docNodes.getLength(); i++) {
1051 Element childDocNode = (Element)docNodes.item(i);
1052
1053 // work out the child docNode's nodeID based on current id
1054 String nodeID = id + "." + (i+1);
1055 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1056 }
1057 } else {
1058 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1059 }
1060
1061 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1062 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1063
1064 }
1065
1066 /** run the XSLT transform which converts from doc.xml format to our internal document format */
1067 protected Element transformArchiveToDocument(Element section) {
1068
1069 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1070 if (stylesheet_filename == null) {
1071 logger.error("Couldn't find stylesheet archive2document.xsl");
1072 return section;
1073 }
1074
1075 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1076 if (stylesheet_doc == null) {
1077 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1078 return section;
1079 }
1080
1081 Document section_doc = XMLConverter.newDOM();
1082 section_doc.appendChild(section_doc.importNode(section, true));
1083 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1084 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1085
1086 Element new_element;
1087 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1088 new_element = ((Document) result).getDocumentElement();
1089 } else {
1090 new_element = (Element) result;
1091 }
1092
1093
1094 return new_element;
1095
1096 }
1097
1098 protected final int NO_QUERY_TERMS = 0;
1099 protected final int NO_EQUIV_QUERY_TERMS = 1;
1100 protected final int EQUIV_QUERY_TERMS = 2;
1101 /**
1102 * this involves a bit of a hack to get the equivalent query terms - has to
1103 * requery the query service - uses the last selected service name. (if it
1104 * ends in query).
1105 */
1106 protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1107 {
1108 Document doc = XMLConverter.newDOM();
1109
1110 // do the query again to get term info
1111 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1112 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1113
1114 HashMap previous_params = (HashMap) params.get("p");
1115 if (previous_params == null)
1116 {
1117 return NO_QUERY_TERMS;
1118 }
1119 String service_name = (String) previous_params.get(GSParams.SERVICE);
1120 if (service_name == null || !service_name.endsWith("Query"))
1121 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1122 logger.debug("invalid service "+service_name+", not doing highlighting");
1123 return NO_QUERY_TERMS;
1124 }
1125
1126 String collection = (String) params.get(GSParams.COLLECTION);
1127 UserContext userContext = new UserContext(request);
1128 String to = GSPath.appendLink(collection, service_name);
1129
1130 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1131 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1132 mr_query_message.appendChild(mr_query_request);
1133
1134 // paramList
1135 HashMap service_params = (HashMap) params.get("s1");
1136
1137 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1138 GSXML.addParametersToList(query_param_list, service_params);
1139 mr_query_request.appendChild(query_param_list);
1140
1141 // do the query
1142 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1143
1144 // find the term lists
1145 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1146 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1147 if (query_term_list_element == null)
1148 {
1149 // no term info
1150 return NO_QUERY_TERMS;
1151 }
1152
1153 int result_code = NO_EQUIV_QUERY_TERMS;
1154 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1155 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1156 {
1157 // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1158 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1159 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1160 {
1161 for (int i = 0; i < terms_nodelist.getLength(); i++)
1162 {
1163 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1164 query_term_variants.add(termValue.toLowerCase());
1165 }
1166 }
1167 }
1168 else
1169 {
1170 result_code = EQUIV_QUERY_TERMS;
1171 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1172 {
1173 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1174 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1175 for (int j = 0; j < equivalent_terms.length; j++)
1176 {
1177 query_term_variants.add(equivalent_terms[j]);
1178 }
1179 }
1180 }
1181
1182 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1183 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1184
1185 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1186 String performed_query = GSXML.getNodeText(query_element) + " ";
1187 logger.debug("performed query="+performed_query);
1188
1189 boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1190 if (performed_query.contains("\"")) {
1191 has_phrases = true;
1192 }
1193
1194 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1195 int term_start = 0;
1196 boolean in_term = false;
1197 boolean in_phrase = false;
1198 for (int i = 0; i < performed_query.length(); i++) {
1199
1200 char character = performed_query.charAt(i);
1201 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1202
1203 // Has a query term just started?
1204 if (in_term == false && is_character_letter_or_digit == true)
1205 {
1206 in_term = true;
1207 term_start = i;
1208 }
1209
1210 // Or has a term just finished?
1211 else if (in_term == true && is_character_letter_or_digit == false)
1212 {
1213 in_term = false;
1214 String term = performed_query.substring(term_start, i);
1215 if (has_phrases) {
1216 // do the phrase bit
1217 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1218 if (result_code == EQUIV_QUERY_TERMS) {
1219 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1220 if (term_element != null) {
1221 // might be null for eg TX in [snails]:TX
1222
1223 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1224 if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
1225 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1226 {
1227 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1228 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1229 for (int k = 0; k < term_equivalent_terms.length; k++)
1230 {
1231 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1232 }
1233 }
1234 }
1235 }
1236 } else { // result_code != EQUIV_QUERY_TERMS
1237 // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1238 if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
1239 // this handles the case where the user has searched for snails, but term list returns 'snail'
1240 phrase_query_p_term_x_variants.add(term.toLowerCase());
1241 }
1242 }
1243 if (phrase_query_p_term_x_variants.size()>0) {
1244 // we have found a valid term
1245 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1246
1247 if (in_phrase == false)
1248 {
1249 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1250 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1251 }
1252 }
1253 } // end if has_phrases
1254 else {
1255 // no phrases so we don't have to do the phrasey stuff. but
1256 // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1257 // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1258 // but dont want to include eg TX
1259 if (result_code == NO_EQUIV_QUERY_TERMS) {
1260 if (containsSubString(query_term_variants, term)) {
1261 query_term_variants.add(term.toLowerCase());
1262 }
1263 }
1264
1265 }
1266 } // end of in_term...
1267 // Watch for phrases (surrounded by quotes)
1268 if (character == '\"') {
1269
1270 // Has a phrase just started?
1271 if (in_phrase == false)
1272 {
1273 in_phrase = true;
1274 }
1275 // Or has a phrase just finished?
1276 else if (in_phrase == true)
1277 {
1278 in_phrase = false;
1279 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1280 }
1281
1282 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1283 } // if char == "
1284 } // for each char in performed query
1285
1286 return result_code;
1287 }
1288
1289 protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1290 // hack to filter out TX, TI field names
1291 String lc_term = term.toLowerCase();
1292 if (query_term_variants.contains(term)) {
1293 return false; // or true??
1294 }
1295 if (term.matches("[A-Z][A-Z][A-Z]?")) {
1296 return false;
1297 }
1298 Iterator i = query_term_variants.iterator();
1299 while (i.hasNext()) {
1300 String t = (String)i.next();
1301 if (term.startsWith(t)) {
1302 return true;
1303 }
1304 }
1305 return false;
1306 }
1307
1308
1309 /** retrieve the marked up highlighted section - only works for solr collection */
1310 protected Element retrieveHighlightedContent(Element request, String node_id) {
1311
1312 Document doc = XMLConverter.newDOM();
1313
1314 // do the query again to get term info
1315 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1316 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1317
1318 HashMap previous_params = (HashMap) params.get("p");
1319 if (previous_params == null)
1320 {
1321 return null;
1322 }
1323 String service_name = (String) previous_params.get(GSParams.SERVICE);
1324 if (service_name == null || !service_name.endsWith("Query"))
1325 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1326 logger.debug("HL invalid service, not doing highlighting");
1327 return null;
1328 }
1329
1330 String collection = (String) params.get(GSParams.COLLECTION);
1331 UserContext userContext = new UserContext(request);
1332 String to = GSPath.appendLink(collection, service_name);
1333
1334 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1335 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1336 mr_query_message.appendChild(mr_query_request);
1337
1338 // paramList
1339 HashMap service_params = (HashMap) params.get("s1");
1340
1341 // hack in case the user searched on eg titles, but we want highlighting in the text
1342 service_params.put("index", "TX");
1343 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1344 GSXML.addParametersToList(query_param_list, service_params);
1345
1346 if (node_id != null) {
1347 GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1348 } else {
1349 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1350 }
1351 mr_query_request.appendChild(query_param_list);
1352 // do the query
1353
1354 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1355 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1356 Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1357
1358 if (highlighted_node == null) {
1359 return null;
1360 }
1361 // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1362 //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1363
1364 // Build a request to process highlighted text
1365
1366 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1367 to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
1368 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1369 hl_message.appendChild(dc_request);
1370
1371 // Create a parameter list to specify the request parameters - empty for now
1372 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1373 dc_request.appendChild(dc_param_list);
1374
1375 // get the content
1376 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1377 dc_request.appendChild(doc_list);
1378 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1379 doc_list.appendChild(current_doc);
1380 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1381 //Append highlighted content to request for processing
1382 dc_request.appendChild(doc.importNode(highlighted_node, true));
1383 Element hl_response_message = (Element) this.mr.process(hl_message);
1384 //Get results
1385 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1386 Element content = (Element) contentList.item(0);
1387 return content;
1388
1389
1390 }
1391 /**
1392 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1393 */
1394 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1395
1396 NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1397 for (int j=named_elems.getLength()-1; j>=0; j--) {
1398 Element this_elem = (Element)named_elems.item(j);
1399 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1400 this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1401 }
1402 return true;
1403 }
1404 /**
1405 * Highlights query terms in the text content of an element.
1406 */
1407 private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1408 {
1409 String content = GSXML.getNodeText(original_element);
1410 // Convert the content string to an array of characters for speed
1411 char[] content_characters = new char[content.length()];
1412 content.getChars(0, content.length(), content_characters, 0);
1413
1414 // Now skim through the content, identifying word matches
1415 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1416 int word_start = 0;
1417 boolean in_word = false;
1418 boolean preceding_word_matched = false;
1419 boolean inTag = false;
1420 for (int i = 0; i < content_characters.length; i++)
1421 {
1422 //We don't want to find words inside HTML tags
1423 if (content_characters[i] == '<')
1424 {
1425 // are we currently in a word?
1426 if (in_word) {
1427 in_word = false;
1428 String word = new String(content_characters, word_start, (i - word_start));
1429 if (case_insensitive) {
1430 word = word.toLowerCase();
1431 }
1432 if (query_term_variants.contains(word)) {
1433 // We have found a matching word, so remember its location
1434 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1435 // should preceding word matched be set to true/false here??
1436 preceding_word_matched = true;
1437 } else {
1438 preceding_word_matched = false;
1439 }
1440 }
1441 inTag = true;
1442 continue;
1443 }
1444 else if (inTag && content_characters[i] == '>')
1445 {
1446 inTag = false;
1447 continue;
1448 }
1449 else if (inTag)
1450 {
1451 continue;
1452 }
1453
1454 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1455
1456 // Has a word just started?
1457 if (in_word == false && is_character_letter_or_digit == true)
1458 {
1459 in_word = true;
1460 word_start = i;
1461 }
1462
1463 // Or has a word just finished?
1464 else if (in_word == true && is_character_letter_or_digit == false)
1465 {
1466 in_word = false;
1467
1468 // Check if the word matches any of the query term equivalents
1469 String word = new String(content_characters, word_start, (i - word_start));
1470 if (case_insensitive) {
1471 word = word.toLowerCase();
1472 }
1473 if (query_term_variants.contains(word))
1474 {
1475 // We have found a matching word, so remember its location
1476 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1477 preceding_word_matched = true;
1478 }
1479 else
1480 {
1481 preceding_word_matched = false;
1482 }
1483 }
1484 }
1485
1486 // Don't forget the last word...
1487 if (in_word == true)
1488 {
1489 // Check if the word matches any of the query term equivalents
1490 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1491 if (case_insensitive) {
1492 word = word.toLowerCase();
1493 }
1494 if (query_term_variants.contains(word))
1495 {
1496 // We have found a matching word, so remember its location
1497 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1498 }
1499 }
1500
1501 if (word_matches.size() == 0) {
1502 // just return a copy of the original element
1503 return (Element)doc.importNode(original_element, true);
1504
1505 }
1506
1507 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1508 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1509
1510 if (phrase_query_term_variants_hierarchy.size() ==0) {
1511 for (int i = 0; i < word_matches.size(); i++) {
1512 highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
1513 highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
1514 }
1515 }
1516 else {
1517 // Deal with phrases now
1518 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1519 for (int i = 0; i < word_matches.size(); i++)
1520 {
1521 WordMatch word_match = word_matches.get(i);
1522
1523 // See if any partial phrase matches are extended by this word
1524 if (word_match.preceding_word_matched)
1525 {
1526 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1527 {
1528 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1529 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1530 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1531 if (phrase_query_p_term_x_variants.contains(word_match.word))
1532 {
1533 partial_phrase_match.num_words_matched++;
1534
1535 // Has a complete phrase match occurred?
1536 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1537 {
1538 // Check for overlaps by looking at the previous highlight range
1539 if (!highlight_end_positions.isEmpty())
1540 {
1541 int last_highlight_index = highlight_end_positions.size() - 1;
1542 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1543 if (last_highlight_end > partial_phrase_match.start_position)
1544 {
1545 // There is an overlap, so remove the previous phrase match
1546 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1547 highlight_end_positions.remove(last_highlight_index);
1548 partial_phrase_match.start_position = last_highlight_start;
1549 }
1550 }
1551
1552 highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
1553 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1554 }
1555 // No, but add the partial match back into the list for next time
1556 else
1557 {
1558 partial_phrase_matches.add(partial_phrase_match);
1559 }
1560 }
1561 }
1562 }
1563 else
1564 {
1565 partial_phrase_matches.clear();
1566 }
1567
1568 // See if this word is at the start of any of the phrases
1569 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1570 {
1571 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1572 if (phrase_query_p_term_variants_list.size()>0) {
1573 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1574 if (phrase_query_p_term_1_variants.contains(word_match.word))
1575 {
1576 // If this phrase is just one word long, we have a complete match
1577 if (phrase_query_p_term_variants_list.size() == 1)
1578 {
1579 highlight_start_positions.add(Integer.valueOf(word_match.start_position));
1580 highlight_end_positions.add(Integer.valueOf(word_match.end_position));
1581 }
1582 // Otherwise we have the start of a potential phrase match
1583 else
1584 {
1585 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1586 }
1587 }
1588 }
1589 }
1590 }
1591 }
1592
1593 // Now add the annotation tags into the document at the correct points
1594 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1595 int last_wrote = 0;
1596 for (int i = 0; i < highlight_start_positions.size(); i++)
1597 {
1598 int highlight_start = highlight_start_positions.get(i).intValue();
1599 int highlight_end = highlight_end_positions.get(i).intValue();
1600
1601 // Print anything before the highlight range
1602 if (last_wrote < highlight_start)
1603 {
1604 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1605 content_element.appendChild(doc.createTextNode(preceding_text));
1606 }
1607
1608 // Print the highlight text, annotated
1609 if (highlight_end > last_wrote)
1610 {
1611 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1612 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1613 annotation_element.setAttribute("type", "query_term");
1614 content_element.appendChild(annotation_element);
1615 last_wrote = highlight_end;
1616 }
1617 }
1618
1619 // Finish off any unwritten text
1620 if (last_wrote < content_characters.length)
1621 {
1622 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1623 content_element.appendChild(doc.createTextNode(remaining_text));
1624 }
1625 return content_element;
1626 }
1627
1628
1629 static private class WordMatch
1630 {
1631 public String word;
1632 public int start_position;
1633 public int end_position;
1634 public boolean preceding_word_matched;
1635
1636 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1637 {
1638 this.word = word;
1639 this.start_position = start_position;
1640 this.end_position = end_position;
1641 this.preceding_word_matched = preceding_word_matched;
1642 }
1643 }
1644
1645 static private class PartialPhraseMatch
1646 {
1647 public int start_position;
1648 public int query_phrase_number;
1649 public int num_words_matched;
1650
1651 public PartialPhraseMatch(int start_position, int query_phrase_number)
1652 {
1653 this.start_position = start_position;
1654 this.query_phrase_number = query_phrase_number;
1655 this.num_words_matched = 1;
1656 }
1657 }
1658}
Note: See TracBrowser for help on using the repository browser.