source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32546

Last change on this file since 32546 was 32546, checked in by kjdon, 5 years ago

use variable instead of hard coded string for paramDefault

  • Property svn:keywords set to Author Date Id Revision
File size: 61.8 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.util.Iterator;
38import java.io.File;
39import java.io.Serializable;
40
41import org.apache.log4j.*;
42
43/** Action class for retrieving Documents via the message router */
44public class DocumentAction extends Action
45{
46
47 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
48
49 // this is used to specify that the sibling nodes of a selected one should be obtained
50 public static final String SIBLING_ARG = "sib";
51 public static final String GOTO_PAGE_ARG = "gp";
52 public static final String ENRICH_DOC_ARG = "end";
53 public static final String EXPAND_DOCUMENT_ARG = "ed";
54 public static final String EXPAND_CONTENTS_ARG = "ec";
55 public static final String REALISTIC_BOOK_ARG = "book";
56 public static final String NO_TEXT_ARG = "noText";
57 public static final String DOC_EDIT_ARG = "docEdit";
58
59 /**
60 * if this is set to true, when a document is displayed, any annotation type
61 * services (enrich) will be offered to the user as well
62 */
63 protected boolean provide_annotations = false;
64
65 protected boolean highlight_query_terms = false;
66
67 public boolean configure()
68 {
69 super.configure();
70 String highlight = (String) config_params.get("highlightQueryTerms");
71 if (highlight != null && highlight.equals("true"))
72 {
73 highlight_query_terms = true;
74 }
75 String annotate = (String) config_params.get("displayAnnotationService");
76 if (annotate != null && annotate.equals("true"))
77 {
78 provide_annotations = true;
79 }
80 return true;
81 }
82
83 public Node process(Node message_node)
84 {
85 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
86
87 Element message = GSXML.nodeToElement(message_node);
88 Document doc = XMLConverter.newDOM();
89
90 // the response
91 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
92 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
93 result.appendChild(page_response);
94
95 // get the request - assume only one
96 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
97 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
98 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
99
100 // just in case there are some that need to get passed to the services
101 // why do we use s0 here and s1 in other places???
102 HashMap service_params = (HashMap) params.get("s0");
103
104 String collection = (String) params.get(GSParams.COLLECTION);
105 String document_id = (String) params.get(GSParams.DOCUMENT);
106 if (document_id != null && document_id.equals(""))
107 {
108 document_id = null;
109 }
110 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
111 if (href != null && href.equals(""))
112 {
113 href = null;
114 }
115 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
116 if (document_id == null && href == null)
117 {
118 logger.error("no document specified!");
119 return result;
120 }
121 if (rl != null && rl.equals("0"))
122 {
123 // this is a true external link, we should have been directed to a different page or action
124 logger.error("rl value was 0, shouldn't get here");
125 return result;
126 }
127
128 UserContext userContext = new UserContext(request);
129
130 //append site metadata
131 addSiteMetadata(page_response, userContext);
132 addInterfaceOptions(page_response);
133
134 // get the additional data needed for the page
135 getBackgroundData(page_response, collection, userContext);
136 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
137
138 if (format_elem != null) {
139 // lets look for param defaults set in config file
140 NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
141 for (int i=0; i<param_defaults.getLength(); i++) {
142 Element p = (Element)param_defaults.item(i);
143 String name = p.getAttribute(GSXML.NAME_ATT);
144 if (params.get(name) ==null) {
145 // wasn't set from interface
146 String value = p.getAttribute(GSXML.VALUE_ATT);
147 params.put(name, value );
148 // also add into request param xml so that xslt knows it too
149 GSXML.addParameterToList(cgi_paramList, name, value);
150 }
151 }
152 }
153
154 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
155 if (document_type != null && document_type.equals(""))
156 {
157 //document_type = "hierarchy";
158 document_type = null; // we'll get it later if not already specified
159 }
160 // what if it is null here?? Anu to check...
161
162
163 boolean editing_document = false;
164 String doc_edit = (String) params.get(DOC_EDIT_ARG);
165 if (doc_edit != null && doc_edit.equals("1")) {
166 editing_document = true;
167 }
168
169 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
170 if (editing_document) {
171 return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
172 }
173
174 //whether to retrieve siblings or not
175 boolean get_siblings = false;
176 String sibs = (String) params.get(SIBLING_ARG);
177 if (sibs != null && sibs.equals("1"))
178 {
179 get_siblings = true;
180 }
181
182 String doc_id_modifier = "";
183 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
184 if (sibling_num != null && !sibling_num.equals(""))
185 {
186 // we have to modify the doc name
187 doc_id_modifier = "." + sibling_num + ".ss";
188 }
189
190 boolean expand_document = false;
191 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
192 if (ed_arg != null && ed_arg.equals("1"))
193 {
194 expand_document = true;
195 }
196
197 boolean expand_contents = false;
198 if (expand_document)
199 { // we always expand the contents with the text
200 expand_contents = true;
201 }
202 else
203 {
204 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
205 if (ec_arg != null && ec_arg.equals("1"))
206 {
207 expand_contents = true;
208 }
209 }
210
211 // do we want text content? Not if no_text=1.
212 // expand_document overrides this. - should it??
213 boolean get_text = true;
214 String nt_arg = (String) params.get(NO_TEXT_ARG);
215
216 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
217 logger.debug("SETTING GET TEXT TO FALSE");
218 get_text = false;
219 } else {
220 logger.debug("GET TEXT REMAINS TRUE");
221 }
222
223 // the_document is where all the doc info - structure and metadata etc
224 // is added into, to be returned in the page
225 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
226 page_response.appendChild(the_document);
227
228 // create a basic doc list containing the current node
229 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
230 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
231 basic_doc_list.appendChild(current_doc);
232 if (document_id != null)
233 {
234 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
235 }
236 else
237 {
238 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
239 // do we need this??
240 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
241 }
242
243 if (document_type == null)
244 {
245 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
246 }
247 if (document_type == null)
248 {
249 logger.debug("##### doctype is null, setting to simple");
250 document_type = GSXML.DOC_TYPE_SIMPLE;
251 }
252
253 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
254
255 // start getting doc structure
256
257 // Create a parameter list to specify the required structure information
258 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
259
260 if (service_params != null)
261 {
262 GSXML.addParametersToList(ds_param_list, service_params);
263 }
264
265 Element ds_param = null;
266 boolean get_structure = false;
267 boolean get_structure_info = false;
268 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
269 {
270 get_structure_info = true;
271
272 if (expand_contents)
273 {
274 ds_param = doc.createElement(GSXML.PARAM_ELEM);
275 ds_param_list.appendChild(ds_param);
276 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
277 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
278 }
279
280 // get the info needed for paged naviagtion
281 ds_param = doc.createElement(GSXML.PARAM_ELEM);
282 ds_param_list.appendChild(ds_param);
283 ds_param.setAttribute(GSXML.NAME_ATT, "info");
284 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
285 ds_param = doc.createElement(GSXML.PARAM_ELEM);
286 ds_param_list.appendChild(ds_param);
287 ds_param.setAttribute(GSXML.NAME_ATT, "info");
288 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
289 ds_param = doc.createElement(GSXML.PARAM_ELEM);
290 ds_param_list.appendChild(ds_param);
291 ds_param.setAttribute(GSXML.NAME_ATT, "info");
292 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
293
294 if (get_siblings)
295 {
296 ds_param = doc.createElement(GSXML.PARAM_ELEM);
297 ds_param_list.appendChild(ds_param);
298 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
299 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
300 }
301
302 }
303 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
304 {
305 get_structure = true;
306 if (expand_contents)
307 {
308 ds_param = doc.createElement(GSXML.PARAM_ELEM);
309 ds_param_list.appendChild(ds_param);
310 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
311 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
312 }
313 else
314 {
315 // get the info needed for table of contents
316 ds_param = doc.createElement(GSXML.PARAM_ELEM);
317 ds_param_list.appendChild(ds_param);
318 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
319 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
320 ds_param = doc.createElement(GSXML.PARAM_ELEM);
321 ds_param_list.appendChild(ds_param);
322 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
323 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
324 if (get_siblings)
325 {
326 ds_param = doc.createElement(GSXML.PARAM_ELEM);
327 ds_param_list.appendChild(ds_param);
328 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
329 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
330 }
331 }
332 }
333 else
334 {
335 // we dont need any structure
336 }
337
338 boolean has_dummy = false;
339 if (get_structure || get_structure_info)
340 {
341
342 // Build a request to obtain the document structure
343 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
344 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
345 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
346 ds_message.appendChild(ds_request);
347 ds_request.appendChild(ds_param_list);
348
349 // add the node list we created earlier
350 ds_request.appendChild(basic_doc_list);
351
352 // Process the document structure retrieve message
353 Element ds_response_message = (Element) this.mr.process(ds_message);
354 if (processErrorElements(ds_response_message, page_response))
355 {
356 return result;
357 }
358
359 // get the info and print out
360 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
361 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
362 path = GSPath.appendLink(path, "nodeStructureInfo");
363 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
364 // get the doc_node bit
365 if (ds_response_struct_info != null)
366 {
367 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
368 }
369 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
370 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
371 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
372 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
373
374 if (ds_response_structure != null)
375 {
376 // add the contents of the structure bit into the_document
377 NodeList structs = ds_response_structure.getChildNodes();
378 for (int i = 0; i < structs.getLength(); i++)
379 {
380 the_document.appendChild(doc.importNode(structs.item(i), true));
381 }
382 }
383 else
384 {
385 // no structure nodes, so put in a dummy doc node
386 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
387 if (document_id != null)
388 {
389 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
390 }
391 else
392 {
393 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
394
395 }
396 the_document.appendChild(doc_node);
397 has_dummy = true;
398 }
399 }
400 else
401 { // a simple type - we dont have a dummy node for simple
402 // should think about this more
403 // no structure request, so just put in a dummy doc node
404 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
405 if (document_id != null)
406 {
407 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
408 }
409 else
410 {
411 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
412 }
413 the_document.appendChild(doc_node);
414 has_dummy = true;
415 }
416
417 // end getting doc structure
418
419 // start getting doc metadata
420
421 // Build a request to obtain some document metadata
422 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
423 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
424 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
425 dm_message.appendChild(dm_request);
426 // Create a parameter list to specify the required metadata information
427
428 HashSet<String> meta_names = new HashSet<String>();
429 meta_names.add("Title"); // the default
430 if (format_elem != null)
431 {
432 getRequiredMetadataNames(format_elem, meta_names);
433 }
434
435 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
436 if (extraMetaListElem != null)
437 {
438 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
439 for (int i = 0; i < extraMetaList.getLength(); i++)
440 {
441 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
442 }
443 }
444
445 Element dm_param_list = createMetadataParamList(doc,meta_names);
446 if (service_params != null)
447 {
448 GSXML.addParametersToList(dm_param_list, service_params);
449 }
450
451 dm_request.appendChild(dm_param_list);
452
453 // create the doc node list for the metadata request
454 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
455 dm_request.appendChild(dm_doc_list);
456
457 // Add each node from the structure response into the metadata request
458 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
459 for (int i = 0; i < doc_nodes.getLength(); i++)
460 {
461 Element doc_node = (Element) doc_nodes.item(i);
462 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
463
464 // Add the documentNode to the list
465 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
466 if (needSectionContent(params)) {
467 if (doc_node_id.equals(document_id)) {
468 dm_doc_list.appendChild(dm_doc_node);
469 }
470 } else {
471 dm_doc_list.appendChild(dm_doc_node);
472 }
473 //dm_doc_list.appendChild(dm_doc_node);
474 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
475 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
476 if (document_id == null){
477 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
478 }
479
480 }
481 // we also want a metadata request to the top level document to get
482 // assocfilepath - this could be cached too
483 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
484 dm_message.appendChild(doc_meta_request);
485 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
486 if (service_params != null)
487 {
488 GSXML.addParametersToList(doc_meta_param_list, service_params);
489 }
490
491 doc_meta_request.appendChild(doc_meta_param_list);
492 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
493 doc_meta_param_list.appendChild(doc_param);
494 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
495 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
496
497 // create the doc node list for the metadata request
498 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
499 doc_meta_request.appendChild(doc_list);
500
501 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
502 // the node we want is the root document node
503 if (document_id != null)
504 {
505 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
506 }
507 /*else
508 {
509 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
510 // can we assume that href is always a top level doc??
511 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
512 //doc_node.setAttribute("externalURL", has_rl);
513 }*/
514 doc_list.appendChild(doc_node);
515
516 Element dm_response_message = (Element) this.mr.process(dm_message);
517 if (processErrorElements(dm_response_message, page_response))
518 {
519 return result;
520 }
521
522 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
523 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
524
525 // Merge the metadata with the structure information
526 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
527 for (int i = 0; i < doc_nodes.getLength(); i++)
528 {
529 Node dcNode;
530 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
531 if (node_idd.isEmpty()) {
532 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
533 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
534 } else {
535 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
536 }
537 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
538 }
539 // get the top level doc metadata out
540 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
541 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
542 GSXML.mergeMetadataLists(the_document, top_doc_node);
543
544 // if we are highlighting query terms, then we also get them highlighted in the metadata
545
546 HashSet<String> query_term_variants = null;
547 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
548 boolean do_highlight_query_terms = highlight_query_terms;
549 int query_terms_status = 0;
550 if (highlight_query_terms) {
551 // lets get the query term equivalents
552 query_term_variants = new HashSet<String>();
553 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
554 if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
555 do_highlight_query_terms = false; // we couldn't get the terms
556 }
557 }
558
559 // lets try marking up the metadata with search terms
560 // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
561 boolean highlight_case_insensitive = false;
562 if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
563 highlight_case_insensitive = true;
564 }
565 if (do_highlight_query_terms) {
566 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
567 }
568
569 // do we want doc text content? If not, we are done.
570 if (!get_text) {
571 // don't get text
572 return result;
573 }
574
575 // Build a request to obtain some document content
576 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
577 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
578 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
579 dc_message.appendChild(dc_request);
580
581 // Create a parameter list to specify the request parameters - empty for now
582 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
583 if (service_params != null)
584 {
585 GSXML.addParametersToList(dc_param_list, service_params);
586 }
587
588 dc_request.appendChild(dc_param_list);
589
590 // get the content
591 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
592 if (expand_document)
593 {
594 dc_request.appendChild(dm_doc_list);
595 }
596 else
597 {
598 dc_request.appendChild(basic_doc_list);
599 }
600 Element dc_response_message = (Element) this.mr.process(dc_message);
601
602 if (processErrorElements(dc_response_message, page_response))
603 {
604 return result;
605
606 }
607 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
608
609 boolean get_marked_up_doc_from_query = false;
610 if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
611 get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
612 }
613
614 if (expand_document)
615 {
616 // Merge the content with the structure information
617 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
618 for (int i = 0; i < doc_nodes.getLength(); i++)
619 {
620 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
621 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
622 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
623 if (content != null)
624 {
625 if (do_highlight_query_terms) {
626 if (get_marked_up_doc_from_query) {
627
628 Element new_content = retrieveHighlightedContent(request, node_id);
629
630 if (new_content == null) {
631 // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
632 get_marked_up_doc_from_query = false;
633 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
634 } else {
635 content= new_content;
636 }
637 } else {
638 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
639 }
640 }
641 doc_nodes.item(i).appendChild(doc.importNode(content, true));
642 }
643
644 }
645 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
646 Element dummy_node = (Element) doc_nodes.item(0);
647 the_document.removeChild(dummy_node);
648 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
649 NodeList dummy_children = dummy_node.getChildNodes();
650 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
651 {
652 // special case as we don't want more than one metadata list
653 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
654 {
655 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
656 }
657 else
658 {
659 the_document.appendChild(dummy_children.item(i));
660 }
661 }
662 }
663 }
664 else
665 {
666 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
667 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
668
669 if (dc_response_doc_content == null)
670 {
671 // no content to add
672 if (dc_response_doc.getAttribute("external").equals("true"))
673 {
674 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
675
676 the_document.setAttribute("selectedNode", href_id);
677 the_document.setAttribute("external", href_id);
678 }
679 return result;
680 }
681 if (do_highlight_query_terms)
682 {
683 dc_response_doc.removeChild(dc_response_doc_content);
684 if (get_marked_up_doc_from_query) {
685 Element new_content = retrieveHighlightedContent(request, null);
686 if (new_content == null) {
687 get_marked_up_doc_from_query = false;
688 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
689 } else {
690
691 dc_response_doc_content = new_content;
692 }
693 } else {
694 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
695 }
696 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
697 }
698
699 if (provide_annotations)
700 {
701 String service_selected = (String) params.get(ENRICH_DOC_ARG);
702 if (service_selected != null && service_selected.equals("1"))
703 {
704 // now we can modifiy the response doc if needed
705 String enrich_service = (String) params.get(GSParams.SERVICE);
706 // send a message to the service
707 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
708 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
709 enrich_message.appendChild(enrich_request);
710 // check for parameters
711 HashMap e_service_params = (HashMap) params.get("s1");
712 if (e_service_params != null)
713 {
714 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
715 GSXML.addParametersToList(enrich_pl, e_service_params);
716 enrich_request.appendChild(enrich_pl);
717 }
718 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
719 enrich_request.appendChild(e_doc_list);
720 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
721
722 Node enrich_response = this.mr.process(enrich_message);
723
724 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
725 path = GSPath.createPath(links);
726 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
727
728 }
729 } // if provide_annotations
730
731 // use the returned id rather than the sent one cos there may have
732 // been modifiers such as .pr that are removed.
733 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
734 the_document.setAttribute("selectedNode", modified_doc_id);
735 if (has_dummy)
736 {
737 // change the id if necessary and add the content
738 Element dummy_node = (Element) doc_nodes.item(0);
739
740 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
741 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
742 // hack for simple type
743 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
744 {
745 // we dont want the internal docNode, just want the content and metadata in the document
746 // rethink this!!
747 the_document.removeChild(dummy_node);
748
749 NodeList dummy_children = dummy_node.getChildNodes();
750 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
751 {
752 // special case as we don't want more than one metadata list
753 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
754 {
755 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
756 }
757 else
758 {
759 the_document.appendChild(dummy_children.item(i));
760 }
761 }
762 }
763
764 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
765 }
766 else
767 {
768 // Merge the document content with the metadata and structure information
769 for (int i = 0; i < doc_nodes.getLength(); i++)
770 {
771 Node dn = doc_nodes.item(i);
772 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
773 if (dn_id.equals(modified_doc_id))
774 {
775 dn.appendChild(doc.importNode(dc_response_doc_content, true));
776 break;
777 }
778 }
779 }
780 }
781 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
782 return result;
783 }
784
785 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
786 // call get archive doc
787 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
788 String to = "DocXMLGetSection";
789 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
790 dx_message.appendChild(dx_request);
791 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
792 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
793 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
794 dx_request.appendChild(dx_section);
795
796 Element dx_response_message = (Element) this.mr.process(dx_message);
797 if (processErrorElements(dx_response_message, page_response))
798 {
799 return result;
800 }
801
802 // get the section out
803 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
804 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
805 if (section == null) {
806 logger.error("no archive doc returned for "+document_id);
807 return result;
808 }
809 // convert the archive format into the internal format that the page response requires
810
811 // work out doctype
812 // NOTE: this will be coming from collection database in index
813 // the archive file doesn't store this. So we have to assume
814 // that the doc type will not be changing with any
815 // modifications happening to archives.
816
817 // if doc type is null, then we need to work it out.
818 // create a basic doc list containing the current node
819
820 if (document_type == null) {
821 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
822 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
823 basic_doc_list.appendChild(current_doc);
824 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
825 basic_doc_list.appendChild(current_doc);
826 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
827 }
828
829 if (document_type == null) {
830 logger.debug("@@@ doctype is null, setting to simple");
831 document_type = GSXML.DOC_TYPE_SIMPLE;
832 }
833
834 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
835 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
836 page_response.appendChild(doc_elem);
837
838 Element transformed_section = transformArchiveToDocument(section);
839 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
840 // simple doc, only returning a single document node, which is the top level section.
841 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
842 GSXML.mergeElements(doc_elem, transformed_section);
843 return result;
844 }
845
846 // multi sectioned document.
847 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
848 // In docEdit mode, we obtain the text from archives, from doc.xml
849 // Now the transformation has replaced <Section> with <documentNode>
850 // Need to add nodeID, nodeType and docType attributes to each docNode
851 // as doc.xml doesn't store that.
852 insertDocNodeAttributes(transformed_section, document_type, null);
853 doc_elem.appendChild(doc.importNode(transformed_section, true));
854 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
855
856 return result;
857 }
858
859
860 private boolean needSectionContent(HashMap<String, Serializable> params) {
861 String document_id = (String) params.get(GSParams.DOCUMENT);
862 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
863 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
864 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
865 return true;
866 }
867
868 return false;
869 }
870 /**
871 * this method gets the collection description, the format info, the list of
872 * enrich services, etc - stuff that is needed for the page, but is the same
873 * whatever the query is - should be cached
874 */
875 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
876 {
877 Document doc = page_response.getOwnerDocument();
878
879 // create a message to process - contains requests for the collection
880 // description, the format element, the enrich services on offer
881 // these could all be cached
882 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
883 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
884 // the format request - ignore for now, where does this request go to??
885 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
886 info_message.appendChild(format_request);
887
888 // the enrich_services request - only do this if provide_annotations is true
889
890 if (provide_annotations)
891 {
892 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
893 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
894 info_message.appendChild(enrich_services_request);
895 }
896
897 Element info_response = (Element) this.mr.process(info_message);
898
899 // the collection is the first response
900 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
901 Element format_resp = (Element) responses.item(0);
902
903 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
904 if (format_elem != null)
905 {
906 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
907 if (global_format_elem != null)
908 {
909 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
910 }
911
912 // set the format type
913 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
914 page_response.appendChild(doc.importNode(format_elem, true));
915 }
916
917 if (provide_annotations)
918 {
919 Element services_resp = (Element) responses.item(1);
920
921 // a new message for the mr
922 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
923 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
924 boolean service_found = false;
925 for (int j = 0; j < e_services.getLength(); j++)
926 {
927 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
928 {
929 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
930 enrich_message.appendChild(s);
931 service_found = true;
932 }
933 }
934 if (service_found)
935 {
936 Element enrich_response = (Element) this.mr.process(enrich_message);
937
938 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
939 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
940 for (int i = 0; i < e_responses.getLength(); i++)
941 {
942 Element e_resp = (Element) e_responses.item(i);
943 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
944 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
945 service_list.appendChild(e_service);
946 }
947 page_response.appendChild(service_list);
948 }
949 } // if provide_annotations
950 return true;
951
952 }
953
954 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
955 {
956 Document doc = basic_doc_list.getOwnerDocument();
957
958 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
959 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
960 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
961 ds_message.appendChild(ds_request);
962
963 // Create a parameter list to specify the required structure information
964 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
965 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
966 ds_param_list.appendChild(ds_param);
967 ds_param.setAttribute(GSXML.NAME_ATT, "info");
968 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
969
970 ds_request.appendChild(ds_param_list);
971
972 // add the node list we created earlier
973 ds_request.appendChild(basic_doc_list);
974
975 // Process the document structure retrieve message
976 Element ds_response_message = (Element) this.mr.process(ds_message);
977 if (processErrorElements(ds_response_message, page_response))
978 {
979 return null;
980 }
981
982 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
983 String path = GSPath.createPath(links);
984 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
985 if (info_elem == null) {
986 return null;
987 }
988 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
989 if (doctype_elem != null)
990 {
991 String doc_type = doctype_elem.getAttribute("value");
992 return doc_type;
993 }
994 return null;
995 }
996
997 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
998 // The docType remains constant as in parameter document_type
999 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
1000 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
1001 // The nodeType is root if rootNode, internal if there are children and leaf if no children
1002 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
1003
1004 boolean isRoot = false;
1005 if(id == null) { // rootNode, get the root nodeID to work with recursively
1006 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
1007 isRoot = true;
1008 } else { // for all but the root node, need to still set the nodeID
1009 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
1010 }
1011
1012 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
1013
1014 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
1015 if(docNodes.getLength() > 0) {
1016 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
1017 for(int i = 0; i < docNodes.getLength(); i++) {
1018 Element childDocNode = (Element)docNodes.item(i);
1019
1020 // work out the child docNode's nodeID based on current id
1021 String nodeID = id + "." + (i+1);
1022 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
1023 }
1024 } else {
1025 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
1026 }
1027
1028 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
1029 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
1030
1031 }
1032
1033 /** run the XSLT transform which converts from doc.xml format to our internal document format */
1034 protected Element transformArchiveToDocument(Element section) {
1035
1036 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
1037 if (stylesheet_filename == null) {
1038 logger.error("Couldn't find stylesheet archive2document.xsl");
1039 return section;
1040 }
1041
1042 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
1043 if (stylesheet_doc == null) {
1044 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1045 return section;
1046 }
1047
1048 Document section_doc = XMLConverter.newDOM();
1049 section_doc.appendChild(section_doc.importNode(section, true));
1050 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1051 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1052
1053 Element new_element;
1054 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1055 new_element = ((Document) result).getDocumentElement();
1056 } else {
1057 new_element = (Element) result;
1058 }
1059
1060
1061 return new_element;
1062
1063 }
1064
1065 protected final int NO_QUERY_TERMS = 0;
1066 protected final int NO_EQUIV_QUERY_TERMS = 1;
1067 protected final int EQUIV_QUERY_TERMS = 2;
1068 /**
1069 * this involves a bit of a hack to get the equivalent query terms - has to
1070 * requery the query service - uses the last selected service name. (if it
1071 * ends in query).
1072 */
1073 protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1074 {
1075 Document doc = XMLConverter.newDOM();
1076
1077 // do the query again to get term info
1078 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1079 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1080
1081 HashMap previous_params = (HashMap) params.get("p");
1082 if (previous_params == null)
1083 {
1084 return NO_QUERY_TERMS;
1085 }
1086 String service_name = (String) previous_params.get(GSParams.SERVICE);
1087 if (service_name == null || !service_name.endsWith("Query"))
1088 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1089 logger.debug("invalid service "+service_name+", not doing highlighting");
1090 return NO_QUERY_TERMS;
1091 }
1092
1093 String collection = (String) params.get(GSParams.COLLECTION);
1094 UserContext userContext = new UserContext(request);
1095 String to = GSPath.appendLink(collection, service_name);
1096
1097 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1098 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1099 mr_query_message.appendChild(mr_query_request);
1100
1101 // paramList
1102 HashMap service_params = (HashMap) params.get("s1");
1103
1104 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1105 GSXML.addParametersToList(query_param_list, service_params);
1106 mr_query_request.appendChild(query_param_list);
1107
1108 // do the query
1109 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1110
1111 // find the term lists
1112 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1113 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1114 if (query_term_list_element == null)
1115 {
1116 // no term info
1117 return NO_QUERY_TERMS;
1118 }
1119
1120 int result_code = NO_EQUIV_QUERY_TERMS;
1121 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1122 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1123 {
1124 // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
1125 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1126 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1127 {
1128 for (int i = 0; i < terms_nodelist.getLength(); i++)
1129 {
1130 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1131 query_term_variants.add(termValue.toLowerCase());
1132 }
1133 }
1134 }
1135 else
1136 {
1137 result_code = EQUIV_QUERY_TERMS;
1138 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1139 {
1140 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1141 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1142 for (int j = 0; j < equivalent_terms.length; j++)
1143 {
1144 query_term_variants.add(equivalent_terms[j]);
1145 }
1146 }
1147 }
1148
1149 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1150 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1151
1152 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1153 String performed_query = GSXML.getNodeText(query_element) + " ";
1154 logger.debug("performed query="+performed_query);
1155
1156 boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
1157 if (performed_query.contains("\"")) {
1158 has_phrases = true;
1159 }
1160
1161 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1162 int term_start = 0;
1163 boolean in_term = false;
1164 boolean in_phrase = false;
1165 for (int i = 0; i < performed_query.length(); i++) {
1166
1167 char character = performed_query.charAt(i);
1168 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1169
1170 // Has a query term just started?
1171 if (in_term == false && is_character_letter_or_digit == true)
1172 {
1173 in_term = true;
1174 term_start = i;
1175 }
1176
1177 // Or has a term just finished?
1178 else if (in_term == true && is_character_letter_or_digit == false)
1179 {
1180 in_term = false;
1181 String term = performed_query.substring(term_start, i);
1182 if (has_phrases) {
1183 // do the phrase bit
1184 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1185 if (result_code == EQUIV_QUERY_TERMS) {
1186 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1187 if (term_element != null) {
1188 // might be null for eg TX in [snails]:TX
1189
1190 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1191 if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
1192 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1193 {
1194 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1195 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1196 for (int k = 0; k < term_equivalent_terms.length; k++)
1197 {
1198 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1199 }
1200 }
1201 }
1202 }
1203 } else { // result_code != EQUIV_QUERY_TERMS
1204 // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
1205 if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
1206 // this handles the case where the user has searched for snails, but term list returns 'snail'
1207 phrase_query_p_term_x_variants.add(term.toLowerCase());
1208 }
1209 }
1210 if (phrase_query_p_term_x_variants.size()>0) {
1211 // we have found a valid term
1212 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1213
1214 if (in_phrase == false)
1215 {
1216 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1217 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1218 }
1219 }
1220 } // end if has_phrases
1221 else {
1222 // no phrases so we don't have to do the phrasey stuff. but
1223 // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
1224 // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
1225 // but dont want to include eg TX
1226 if (result_code == NO_EQUIV_QUERY_TERMS) {
1227 if (containsSubString(query_term_variants, term)) {
1228 query_term_variants.add(term.toLowerCase());
1229 }
1230 }
1231
1232 }
1233 } // end of in_term...
1234 // Watch for phrases (surrounded by quotes)
1235 if (character == '\"') {
1236
1237 // Has a phrase just started?
1238 if (in_phrase == false)
1239 {
1240 in_phrase = true;
1241 }
1242 // Or has a phrase just finished?
1243 else if (in_phrase == true)
1244 {
1245 in_phrase = false;
1246 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1247 }
1248
1249 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1250 } // if char == "
1251 } // for each char in performed query
1252
1253 return result_code;
1254 }
1255
1256 protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
1257 // hack to filter out TX, TI field names
1258 String lc_term = term.toLowerCase();
1259 if (query_term_variants.contains(term)) {
1260 return false; // or true??
1261 }
1262 if (term.matches("[A-Z][A-Z][A-Z]?")) {
1263 return false;
1264 }
1265 Iterator i = query_term_variants.iterator();
1266 while (i.hasNext()) {
1267 String t = (String)i.next();
1268 if (term.startsWith(t)) {
1269 return true;
1270 }
1271 }
1272 return false;
1273 }
1274
1275
1276 /** retrieve the marked up highlighted section - only works for solr collection */
1277 protected Element retrieveHighlightedContent(Element request, String node_id) {
1278
1279 Document doc = XMLConverter.newDOM();
1280
1281 // do the query again to get term info
1282 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1283 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1284
1285 HashMap previous_params = (HashMap) params.get("p");
1286 if (previous_params == null)
1287 {
1288 return null;
1289 }
1290 String service_name = (String) previous_params.get(GSParams.SERVICE);
1291 if (service_name == null || !service_name.endsWith("Query"))
1292 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1293 logger.debug("HL invalid service, not doing highlighting");
1294 return null;
1295 }
1296
1297 String collection = (String) params.get(GSParams.COLLECTION);
1298 UserContext userContext = new UserContext(request);
1299 String to = GSPath.appendLink(collection, service_name);
1300
1301 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1302 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1303 mr_query_message.appendChild(mr_query_request);
1304
1305 // paramList
1306 HashMap service_params = (HashMap) params.get("s1");
1307
1308 // hack in case the user searched on eg titles, but we want highlighting in the text
1309 service_params.put("index", "TX");
1310 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1311 GSXML.addParametersToList(query_param_list, service_params);
1312
1313 if (node_id != null) {
1314 GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
1315 } else {
1316 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1317 }
1318 mr_query_request.appendChild(query_param_list);
1319 // do the query
1320
1321 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1322 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1323 Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1324
1325 if (highlighted_node == null) {
1326 return null;
1327 }
1328 // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
1329 //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1330
1331 // Build a request to process highlighted text
1332
1333 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1334 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1335 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1336 hl_message.appendChild(dc_request);
1337
1338 // Create a parameter list to specify the request parameters - empty for now
1339 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1340 dc_request.appendChild(dc_param_list);
1341
1342 // get the content
1343 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1344 dc_request.appendChild(doc_list);
1345 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1346 doc_list.appendChild(current_doc);
1347 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1348 //Append highlighted content to request for processing
1349 dc_request.appendChild(doc.importNode(highlighted_node, true));
1350 Element hl_response_message = (Element) this.mr.process(hl_message);
1351 //Get results
1352 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1353 Element content = (Element) contentList.item(0);
1354 return content;
1355
1356
1357 }
1358 /**
1359 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
1360 */
1361 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
1362
1363 NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
1364 for (int j=named_elems.getLength()-1; j>=0; j--) {
1365 Element this_elem = (Element)named_elems.item(j);
1366 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
1367 this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
1368 }
1369 return true;
1370 }
1371 /**
1372 * Highlights query terms in the text content of an element.
1373 */
1374 private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
1375 {
1376 String content = GSXML.getNodeText(original_element);
1377 // Convert the content string to an array of characters for speed
1378 char[] content_characters = new char[content.length()];
1379 content.getChars(0, content.length(), content_characters, 0);
1380
1381 // Now skim through the content, identifying word matches
1382 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1383 int word_start = 0;
1384 boolean in_word = false;
1385 boolean preceding_word_matched = false;
1386 boolean inTag = false;
1387 for (int i = 0; i < content_characters.length; i++)
1388 {
1389 //We don't want to find words inside HTML tags
1390 if (content_characters[i] == '<')
1391 {
1392 // are we currently in a word?
1393 if (in_word) {
1394 in_word = false;
1395 String word = new String(content_characters, word_start, (i - word_start));
1396 if (case_insensitive) {
1397 word = word.toLowerCase();
1398 }
1399 if (query_term_variants.contains(word)) {
1400 // We have found a matching word, so remember its location
1401 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1402 // should preceding word matched be set to true/false here??
1403 preceding_word_matched = true;
1404 } else {
1405 preceding_word_matched = false;
1406 }
1407 }
1408 inTag = true;
1409 continue;
1410 }
1411 else if (inTag && content_characters[i] == '>')
1412 {
1413 inTag = false;
1414 continue;
1415 }
1416 else if (inTag)
1417 {
1418 continue;
1419 }
1420
1421 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1422
1423 // Has a word just started?
1424 if (in_word == false && is_character_letter_or_digit == true)
1425 {
1426 in_word = true;
1427 word_start = i;
1428 }
1429
1430 // Or has a word just finished?
1431 else if (in_word == true && is_character_letter_or_digit == false)
1432 {
1433 in_word = false;
1434
1435 // Check if the word matches any of the query term equivalents
1436 String word = new String(content_characters, word_start, (i - word_start));
1437 if (case_insensitive) {
1438 word = word.toLowerCase();
1439 }
1440 if (query_term_variants.contains(word))
1441 {
1442 // We have found a matching word, so remember its location
1443 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1444 preceding_word_matched = true;
1445 }
1446 else
1447 {
1448 preceding_word_matched = false;
1449 }
1450 }
1451 }
1452
1453 // Don't forget the last word...
1454 if (in_word == true)
1455 {
1456 // Check if the word matches any of the query term equivalents
1457 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1458 if (case_insensitive) {
1459 word = word.toLowerCase();
1460 }
1461 if (query_term_variants.contains(word))
1462 {
1463 // We have found a matching word, so remember its location
1464 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1465 }
1466 }
1467
1468 if (word_matches.size() == 0) {
1469 // just return a copy of the original element
1470 return (Element)doc.importNode(original_element, true);
1471
1472 }
1473
1474 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1475 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1476
1477 if (phrase_query_term_variants_hierarchy.size() ==0) {
1478 for (int i = 0; i < word_matches.size(); i++) {
1479 highlight_start_positions.add(new Integer(word_matches.get(i).start_position));
1480 highlight_end_positions.add(new Integer(word_matches.get(i).end_position));
1481 }
1482 }
1483 else {
1484 // Deal with phrases now
1485 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1486 for (int i = 0; i < word_matches.size(); i++)
1487 {
1488 WordMatch word_match = word_matches.get(i);
1489
1490 // See if any partial phrase matches are extended by this word
1491 if (word_match.preceding_word_matched)
1492 {
1493 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1494 {
1495 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1496 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1497 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1498 if (phrase_query_p_term_x_variants.contains(word_match.word))
1499 {
1500 partial_phrase_match.num_words_matched++;
1501
1502 // Has a complete phrase match occurred?
1503 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1504 {
1505 // Check for overlaps by looking at the previous highlight range
1506 if (!highlight_end_positions.isEmpty())
1507 {
1508 int last_highlight_index = highlight_end_positions.size() - 1;
1509 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1510 if (last_highlight_end > partial_phrase_match.start_position)
1511 {
1512 // There is an overlap, so remove the previous phrase match
1513 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1514 highlight_end_positions.remove(last_highlight_index);
1515 partial_phrase_match.start_position = last_highlight_start;
1516 }
1517 }
1518
1519 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1520 highlight_end_positions.add(new Integer(word_match.end_position));
1521 }
1522 // No, but add the partial match back into the list for next time
1523 else
1524 {
1525 partial_phrase_matches.add(partial_phrase_match);
1526 }
1527 }
1528 }
1529 }
1530 else
1531 {
1532 partial_phrase_matches.clear();
1533 }
1534
1535 // See if this word is at the start of any of the phrases
1536 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1537 {
1538 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1539 if (phrase_query_p_term_variants_list.size()>0) {
1540 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1541 if (phrase_query_p_term_1_variants.contains(word_match.word))
1542 {
1543 // If this phrase is just one word long, we have a complete match
1544 if (phrase_query_p_term_variants_list.size() == 1)
1545 {
1546 highlight_start_positions.add(new Integer(word_match.start_position));
1547 highlight_end_positions.add(new Integer(word_match.end_position));
1548 }
1549 // Otherwise we have the start of a potential phrase match
1550 else
1551 {
1552 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1553 }
1554 }
1555 }
1556 }
1557 }
1558 }
1559
1560 // Now add the annotation tags into the document at the correct points
1561 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
1562 int last_wrote = 0;
1563 for (int i = 0; i < highlight_start_positions.size(); i++)
1564 {
1565 int highlight_start = highlight_start_positions.get(i).intValue();
1566 int highlight_end = highlight_end_positions.get(i).intValue();
1567
1568 // Print anything before the highlight range
1569 if (last_wrote < highlight_start)
1570 {
1571 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1572 content_element.appendChild(doc.createTextNode(preceding_text));
1573 }
1574
1575 // Print the highlight text, annotated
1576 if (highlight_end > last_wrote)
1577 {
1578 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1579 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1580 annotation_element.setAttribute("type", "query_term");
1581 content_element.appendChild(annotation_element);
1582 last_wrote = highlight_end;
1583 }
1584 }
1585
1586 // Finish off any unwritten text
1587 if (last_wrote < content_characters.length)
1588 {
1589 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1590 content_element.appendChild(doc.createTextNode(remaining_text));
1591 }
1592 return content_element;
1593 }
1594
1595
1596 static private class WordMatch
1597 {
1598 public String word;
1599 public int start_position;
1600 public int end_position;
1601 public boolean preceding_word_matched;
1602
1603 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1604 {
1605 this.word = word;
1606 this.start_position = start_position;
1607 this.end_position = end_position;
1608 this.preceding_word_matched = preceding_word_matched;
1609 }
1610 }
1611
1612 static private class PartialPhraseMatch
1613 {
1614 public int start_position;
1615 public int query_phrase_number;
1616 public int num_words_matched;
1617
1618 public PartialPhraseMatch(int start_position, int query_phrase_number)
1619 {
1620 this.start_position = start_position;
1621 this.query_phrase_number = query_phrase_number;
1622 this.num_words_matched = 1;
1623 }
1624 }
1625}
Note: See TracBrowser for help on using the repository browser.