source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32448

Last change on this file since 32448 was 32448, checked in by kjdon, 6 years ago

The params class changed and now returns false by default for shouldsave, so we no longer need to add any params that we don't want saved in the session. Also turned hard-coded strings into static String variables.

  • Property svn:keywords set to Author Date Id Revision
File size: 54.0 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
// Class-wide log4j logger, named after this class.
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
 // Names of the request parameters that process() looks up in the params map.
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib";
 // when non-empty, process() appends "." + value + ".ss" to the requested document id
50 public static final String GOTO_PAGE_ARG = "gp";
 // "1" asks process() to send the retrieved content to the selected enrich service
 // (only honoured when provide_annotations is true)
51 public static final String ENRICH_DOC_ARG = "end";
 // "1" requests the whole document; this also forces the contents to be expanded
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
 // "1" requests the entire structure rather than just ancestors/children
53 public static final String EXPAND_CONTENTS_ARG = "ec";
 // NOTE(review): not referenced in the visible part of this file - presumably consumed elsewhere; confirm
54 public static final String REALISTIC_BOOK_ARG = "book";
 // "1" suppresses the content request, unless the whole document is being expanded
55 public static final String NO_TEXT_ARG = "noText";
 // "1" switches process() into document editing mode (archive doc.xml is returned instead)
56 public static final String DOC_EDIT_ARG = "docEdit";
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well
61 */
62 protected boolean provide_annotations = false;
63
 // when true, retrieved content is passed through highlightQueryTerms() before being
 // added to the page; switched on by the "highlightQueryTerms" config param in configure()
64 protected boolean highlight_query_terms = false;
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 HashMap service_params = (HashMap) params.get("s0");
101
102 String collection = (String) params.get(GSParams.COLLECTION);
103 String document_id = (String) params.get(GSParams.DOCUMENT);
104 if (document_id != null && document_id.equals(""))
105 {
106 document_id = null;
107 }
108 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109 if (href != null && href.equals(""))
110 {
111 href = null;
112 }
113 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114 if (document_id == null && href == null)
115 {
116 logger.error("no document specified!");
117 return result;
118 }
119 if (rl != null && rl.equals("0"))
120 {
121 // this is a true external link, we should have been directed to a different page or action
122 logger.error("rl value was 0, shouldn't get here");
123 return result;
124 }
125
126 UserContext userContext = new UserContext(request);
127
128 //append site metadata
129 addSiteMetadata(page_response, userContext);
130 addInterfaceOptions(page_response);
131
132 // get the additional data needed for the page
133 getBackgroundData(page_response, collection, userContext);
134 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136 if (format_elem != null) {
137 // lets look for param defaults set in config file
138 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139 for (int i=0; i<param_defaults.getLength(); i++) {
140 Element p = (Element)param_defaults.item(i);
141 String name = p.getAttribute(GSXML.NAME_ATT);
142 if (params.get(name) ==null) {
143 // wasn't set from interface
144 String value = p.getAttribute(GSXML.VALUE_ATT);
145 params.put(name, value );
146 // also add into request param xml so that xslt knows it too
147 GSXML.addParameterToList(cgi_paramList, name, value);
148 }
149 }
150 }
151
152 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153 if (document_type != null && document_type.equals(""))
154 {
155 //document_type = "hierarchy";
156 document_type = null; // we'll get it later if not already specified
157 }
158 // what if it is null here?? Anu to check...
159
160
161 boolean editing_document = false;
162 String doc_edit = (String) params.get(DOC_EDIT_ARG);
163 if (doc_edit != null && doc_edit.equals("1")) {
164 editing_document = true;
165 }
166
167 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
168 if (editing_document) {
169
170 // call get archive doc
171 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172 String to = "DocXMLGetSection";
173 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174 dx_message.appendChild(dx_request);
175 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178 dx_request.appendChild(dx_section);
179
180 Element dx_response_message = (Element) this.mr.process(dx_message);
181 if (processErrorElements(dx_response_message, page_response))
182 {
183 return result;
184 }
185
186 // get the section out
187 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189 if (section == null) {
190 logger.error("no archive doc returned for "+document_id);
191 return result;
192 }
193 // convert the archive format into the internal format that the page response requires
194
195 // work out doctype
196 // NOTE: this will be coming from collection database in index
197 // the archive file doesn't store this. So we have to assume
198 // that the doc type will not be changing with any
199 // modifications happening to archives.
200
201 // if doc type is null, then we need to work it out.
202 // create a basic doc list containing the current node
203
204 if (document_type == null) {
205 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
206 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
207 basic_doc_list.appendChild(current_doc);
208 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
209 basic_doc_list.appendChild(current_doc);
210 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
211 }
212
213 if (document_type == null) {
214 logger.debug("@@@ doctype is null, setting to simple");
215 document_type = GSXML.DOC_TYPE_SIMPLE;
216 }
217
218 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
219 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
220 page_response.appendChild(doc_elem);
221
222 Element transformed_section = transformArchiveToDocument(section);
223 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
224 // simple doc, only returning a single document node, which is the top level section.
225 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
226 GSXML.mergeElements(doc_elem, transformed_section);
227 return result;
228 }
229
230 // multi sectioned document.
231 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
232 // In docEdit mode, we obtain the text from archives, from doc.xml
233 // Now the transformation has replaced <Section> with <documentNode>
234 // Need to add nodeID, nodeType and docType attributes to each docNode
235 // as doc.xml doesn't store that.
236 insertDocNodeAttributes(transformed_section, document_type, null);
237 doc_elem.appendChild(doc.importNode(transformed_section, true));
238 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
239
240 return result;
241 }
242
243 //whether to retrieve siblings or not
244 boolean get_siblings = false;
245 String sibs = (String) params.get(SIBLING_ARG);
246 if (sibs != null && sibs.equals("1"))
247 {
248 get_siblings = true;
249 }
250
251 String doc_id_modifier = "";
252 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
253 if (sibling_num != null && !sibling_num.equals(""))
254 {
255 // we have to modify the doc name
256 doc_id_modifier = "." + sibling_num + ".ss";
257 }
258
259 boolean expand_document = false;
260 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
261 if (ed_arg != null && ed_arg.equals("1"))
262 {
263 expand_document = true;
264 }
265
266 boolean expand_contents = false;
267 if (expand_document)
268 { // we always expand the contents with the text
269 expand_contents = true;
270 }
271 else
272 {
273 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
274 if (ec_arg != null && ec_arg.equals("1"))
275 {
276 expand_contents = true;
277 }
278 }
279
280 // do we want text content? Not if no_text=1.
281 // expand_document overrides this. - should it??
282 boolean get_text = true;
283 String nt_arg = (String) params.get(NO_TEXT_ARG);
284
285 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
286 logger.debug("SETTING GET TEXT TO FALSE");
287 get_text = false;
288 } else {
289 logger.debug("GET TEXT REMAINS TRUE");
290 }
291
292 // the_document is where all the doc info - structure and metadata etc
293 // is added into, to be returned in the page
294 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
295 page_response.appendChild(the_document);
296
297 // create a basic doc list containing the current node
298 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
299 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
300 basic_doc_list.appendChild(current_doc);
301 if (document_id != null)
302 {
303 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
304 }
305 else
306 {
307 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
308 // do we need this??
309 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
310 }
311
312 if (document_type == null)
313 {
314 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
315 }
316 if (document_type == null)
317 {
318 logger.debug("##### doctype is null, setting to simple");
319 document_type = GSXML.DOC_TYPE_SIMPLE;
320 }
321
322 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
323
324 // Create a parameter list to specify the required structure information
325 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
326
327 if (service_params != null)
328 {
329 GSXML.addParametersToList(ds_param_list, service_params);
330 }
331
332 Element ds_param = null;
333 boolean get_structure = false;
334 boolean get_structure_info = false;
335 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
336 {
337 get_structure_info = true;
338
339 if (expand_contents)
340 {
341 ds_param = doc.createElement(GSXML.PARAM_ELEM);
342 ds_param_list.appendChild(ds_param);
343 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
344 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
345 }
346
347 // get the info needed for paged naviagtion
348 ds_param = doc.createElement(GSXML.PARAM_ELEM);
349 ds_param_list.appendChild(ds_param);
350 ds_param.setAttribute(GSXML.NAME_ATT, "info");
351 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
352 ds_param = doc.createElement(GSXML.PARAM_ELEM);
353 ds_param_list.appendChild(ds_param);
354 ds_param.setAttribute(GSXML.NAME_ATT, "info");
355 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
356 ds_param = doc.createElement(GSXML.PARAM_ELEM);
357 ds_param_list.appendChild(ds_param);
358 ds_param.setAttribute(GSXML.NAME_ATT, "info");
359 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
360
361 if (get_siblings)
362 {
363 ds_param = doc.createElement(GSXML.PARAM_ELEM);
364 ds_param_list.appendChild(ds_param);
365 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
366 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
367 }
368
369 }
370 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
371 {
372 get_structure = true;
373 if (expand_contents)
374 {
375 ds_param = doc.createElement(GSXML.PARAM_ELEM);
376 ds_param_list.appendChild(ds_param);
377 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
378 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
379 }
380 else
381 {
382 // get the info needed for table of contents
383 ds_param = doc.createElement(GSXML.PARAM_ELEM);
384 ds_param_list.appendChild(ds_param);
385 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
386 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
387 ds_param = doc.createElement(GSXML.PARAM_ELEM);
388 ds_param_list.appendChild(ds_param);
389 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
390 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
391 if (get_siblings)
392 {
393 ds_param = doc.createElement(GSXML.PARAM_ELEM);
394 ds_param_list.appendChild(ds_param);
395 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
396 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
397 }
398 }
399 }
400 else
401 {
402 // we dont need any structure
403 }
404
405 boolean has_dummy = false;
406 if (get_structure || get_structure_info)
407 {
408
409 // Build a request to obtain the document structure
410 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
411 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
412 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
413 ds_message.appendChild(ds_request);
414 ds_request.appendChild(ds_param_list);
415
416 // add the node list we created earlier
417 ds_request.appendChild(basic_doc_list);
418
419 // Process the document structure retrieve message
420 Element ds_response_message = (Element) this.mr.process(ds_message);
421 if (processErrorElements(ds_response_message, page_response))
422 {
423 return result;
424 }
425
426 // get the info and print out
427 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
428 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
429 path = GSPath.appendLink(path, "nodeStructureInfo");
430 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
431 // get the doc_node bit
432 if (ds_response_struct_info != null)
433 {
434 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
435 }
436 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
437 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
438 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
439 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
440
441 if (ds_response_structure != null)
442 {
443 // add the contents of the structure bit into the_document
444 NodeList structs = ds_response_structure.getChildNodes();
445 for (int i = 0; i < structs.getLength(); i++)
446 {
447 the_document.appendChild(doc.importNode(structs.item(i), true));
448 }
449 }
450 else
451 {
452 // no structure nodes, so put in a dummy doc node
453 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
454 if (document_id != null)
455 {
456 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
457 }
458 else
459 {
460 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
461
462 }
463 the_document.appendChild(doc_node);
464 has_dummy = true;
465 }
466 }
467 else
468 { // a simple type - we dont have a dummy node for simple
469 // should think about this more
470 // no structure request, so just put in a dummy doc node
471 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
472 if (document_id != null)
473 {
474 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
475 }
476 else
477 {
478 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
479 }
480 the_document.appendChild(doc_node);
481 has_dummy = true;
482 }
483
484 // Build a request to obtain some document metadata
485 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
486 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
487 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
488 dm_message.appendChild(dm_request);
489 // Create a parameter list to specify the required metadata information
490
491 HashSet<String> meta_names = new HashSet<String>();
492 meta_names.add("Title"); // the default
493 if (format_elem != null)
494 {
495 getRequiredMetadataNames(format_elem, meta_names);
496 }
497
498 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
499 if (extraMetaListElem != null)
500 {
501 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
502 for (int i = 0; i < extraMetaList.getLength(); i++)
503 {
504 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
505 }
506 }
507
508 Element dm_param_list = createMetadataParamList(doc,meta_names);
509 if (service_params != null)
510 {
511 GSXML.addParametersToList(dm_param_list, service_params);
512 }
513
514 dm_request.appendChild(dm_param_list);
515
516 // create the doc node list for the metadata request
517 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
518 dm_request.appendChild(dm_doc_list);
519
520 // Add each node from the structure response into the metadata request
521 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
522 for (int i = 0; i < doc_nodes.getLength(); i++)
523 {
524 Element doc_node = (Element) doc_nodes.item(i);
525 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
526
527 // Add the documentNode to the list
528 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
529 if (needSectionContent(params)) {
530 if (doc_node_id.equals(document_id)) {
531 dm_doc_list.appendChild(dm_doc_node);
532 }
533 } else {
534 dm_doc_list.appendChild(dm_doc_node);
535 }
536 //dm_doc_list.appendChild(dm_doc_node);
537 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
538 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
539 if (document_id == null){
540 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
541 }
542
543 }
544 // we also want a metadata request to the top level document to get
545 // assocfilepath - this could be cached too
546 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
547 dm_message.appendChild(doc_meta_request);
548 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
549 if (service_params != null)
550 {
551 GSXML.addParametersToList(doc_meta_param_list, service_params);
552 }
553
554 doc_meta_request.appendChild(doc_meta_param_list);
555 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
556 doc_meta_param_list.appendChild(doc_param);
557 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
558 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
559
560 // create the doc node list for the metadata request
561 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
562 doc_meta_request.appendChild(doc_list);
563
564 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
565 // the node we want is the root document node
566 if (document_id != null)
567 {
568 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
569 }
570 /*else
571 {
572 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
573 // can we assume that href is always a top level doc??
574 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
575 //doc_node.setAttribute("externalURL", has_rl);
576 }*/
577 doc_list.appendChild(doc_node);
578
579 Element dm_response_message = (Element) this.mr.process(dm_message);
580 if (processErrorElements(dm_response_message, page_response))
581 {
582 return result;
583 }
584
585 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
586 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
587
588 // Merge the metadata with the structure information
589 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
590 for (int i = 0; i < doc_nodes.getLength(); i++)
591 {
592 Node dcNode;
593 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
594 if (node_idd.isEmpty()) {
595 String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
596 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
597 } else {
598 dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
599 }
600 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
601 }
602 // get the top level doc metadata out
603 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
604 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
605 GSXML.mergeMetadataLists(the_document, top_doc_node);
606
607 // do we want doc text content? If not, we are done.
608 if (!get_text) {
609 // don't get text
610 return result;
611 }
612
613 // Build a request to obtain some document content
614 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
615 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
616 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
617 dc_message.appendChild(dc_request);
618
619 // Create a parameter list to specify the request parameters - empty for now
620 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
621 if (service_params != null)
622 {
623 GSXML.addParametersToList(dc_param_list, service_params);
624 }
625
626 dc_request.appendChild(dc_param_list);
627
628 // get the content
629 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
630 if (expand_document)
631 {
632 dc_request.appendChild(dm_doc_list);
633 }
634 else
635 {
636 dc_request.appendChild(basic_doc_list);
637 }
638 Element dc_response_message = (Element) this.mr.process(dc_message);
639
640 if (processErrorElements(dc_response_message, page_response))
641 {
642 return result;
643
644 }
645 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
646
647 if (expand_document)
648 {
649 // Merge the content with the structure information
650 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
651 for (int i = 0; i < doc_nodes.getLength(); i++)
652 {
653 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
654 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
655 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
656 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
657 if (content != null)
658 {
659 if (highlight_query_terms)
660 {
661
662 content = highlightQueryTerms(request, node_id, (Element) content);
663 }
664
665 doc_nodes.item(i).appendChild(doc.importNode(content, true));
666 }
667 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
668 }
669 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
670 Element dummy_node = (Element) doc_nodes.item(0);
671 the_document.removeChild(dummy_node);
672 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
673 NodeList dummy_children = dummy_node.getChildNodes();
674 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
675 {
676 // special case as we don't want more than one metadata list
677 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
678 {
679 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
680 }
681 else
682 {
683 the_document.appendChild(dummy_children.item(i));
684 }
685 }
686 }
687 }
688 else
689 {
690 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
691 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
692 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
693 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
694
695 if (dc_response_doc_content == null)
696 {
697 // no content to add
698 if (dc_response_doc.getAttribute("external").equals("true"))
699 {
700
701 //if (dc_response_doc_external != null)
702 //{
703 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
704
705 the_document.setAttribute("selectedNode", href_id);
706 the_document.setAttribute("external", href_id);
707 }
708 return result;
709 }
710 if (highlight_query_terms)
711 {
712 dc_response_doc.removeChild(dc_response_doc_content);
713
714 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
715 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
716 }
717
718 if (provide_annotations)
719 {
720 String service_selected = (String) params.get(ENRICH_DOC_ARG);
721 if (service_selected != null && service_selected.equals("1"))
722 {
723 // now we can modifiy the response doc if needed
724 String enrich_service = (String) params.get(GSParams.SERVICE);
725 // send a message to the service
726 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
727 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
728 enrich_message.appendChild(enrich_request);
729 // check for parameters
730 HashMap e_service_params = (HashMap) params.get("s1");
731 if (e_service_params != null)
732 {
733 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
734 GSXML.addParametersToList(enrich_pl, e_service_params);
735 enrich_request.appendChild(enrich_pl);
736 }
737 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
738 enrich_request.appendChild(e_doc_list);
739 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
740
741 Node enrich_response = this.mr.process(enrich_message);
742
743 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
744 path = GSPath.createPath(links);
745 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
746
747 }
748 } // if provide_annotations
749
750 // use the returned id rather than the sent one cos there may have
751 // been modifiers such as .pr that are removed.
752 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
753 the_document.setAttribute("selectedNode", modified_doc_id);
754 if (has_dummy)
755 {
756 // change the id if necessary and add the content
757 Element dummy_node = (Element) doc_nodes.item(0);
758
759 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
760 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
761 // hack for simple type
762 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
763 {
764 // we dont want the internal docNode, just want the content and metadata in the document
765 // rethink this!!
766 the_document.removeChild(dummy_node);
767
768 NodeList dummy_children = dummy_node.getChildNodes();
769 //for (int i=0; i<dummy_children.getLength(); i++) {
770 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
771 {
772 // special case as we don't want more than one metadata list
773 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
774 {
775 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
776 }
777 else
778 {
779 the_document.appendChild(dummy_children.item(i));
780 }
781 }
782 }
783
784 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
785 }
786 else
787 {
788 // Merge the document content with the metadata and structure information
789 for (int i = 0; i < doc_nodes.getLength(); i++)
790 {
791 Node dn = doc_nodes.item(i);
792 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
793 if (dn_id.equals(modified_doc_id))
794 {
795 dn.appendChild(doc.importNode(dc_response_doc_content, true));
796 break;
797 }
798 }
799 }
800 }
801 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
802 return result;
803 }
804
805
806 private boolean needSectionContent(HashMap<String, Serializable> params) {
807 String document_id = (String) params.get(GSParams.DOCUMENT);
808 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
809 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
810 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
811 return true;
812 }
813
814 return false;
815 }
816 /**
817 * this method gets the collection description, the format info, the list of
818 * enrich services, etc - stuff that is needed for the page, but is the same
819 * whatever the query is - should be cached
820 */
821 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
822 {
823 Document doc = page_response.getOwnerDocument();
824
825 // create a message to process - contains requests for the collection
826 // description, the format element, the enrich services on offer
827 // these could all be cached
828 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
829 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
830 // the format request - ignore for now, where does this request go to??
831 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
832 info_message.appendChild(format_request);
833
834 // the enrich_services request - only do this if provide_annotations is true
835
836 if (provide_annotations)
837 {
838 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
839 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
840 info_message.appendChild(enrich_services_request);
841 }
842
843 Element info_response = (Element) this.mr.process(info_message);
844
845 // the collection is the first response
846 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
847 Element format_resp = (Element) responses.item(0);
848
849 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
850 if (format_elem != null)
851 {
852 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
853 if (global_format_elem != null)
854 {
855 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
856 }
857
858 // set the format type
859 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
860 page_response.appendChild(doc.importNode(format_elem, true));
861 }
862
863 if (provide_annotations)
864 {
865 Element services_resp = (Element) responses.item(1);
866
867 // a new message for the mr
868 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
869 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
870 boolean service_found = false;
871 for (int j = 0; j < e_services.getLength(); j++)
872 {
873 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
874 {
875 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
876 enrich_message.appendChild(s);
877 service_found = true;
878 }
879 }
880 if (service_found)
881 {
882 Element enrich_response = (Element) this.mr.process(enrich_message);
883
884 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
885 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
886 for (int i = 0; i < e_responses.getLength(); i++)
887 {
888 Element e_resp = (Element) e_responses.item(i);
889 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
890 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
891 service_list.appendChild(e_service);
892 }
893 page_response.appendChild(service_list);
894 }
895 } // if provide_annotations
896 return true;
897
898 }
899
900 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
901 {
902 Document doc = basic_doc_list.getOwnerDocument();
903
904 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
905 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
906 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
907 ds_message.appendChild(ds_request);
908
909 // Create a parameter list to specify the required structure information
910 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
911 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
912 ds_param_list.appendChild(ds_param);
913 ds_param.setAttribute(GSXML.NAME_ATT, "info");
914 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
915
916 ds_request.appendChild(ds_param_list);
917
918 // add the node list we created earlier
919 ds_request.appendChild(basic_doc_list);
920
921 // Process the document structure retrieve message
922 Element ds_response_message = (Element) this.mr.process(ds_message);
923 if (processErrorElements(ds_response_message, page_response))
924 {
925 return null;
926 }
927
928 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
929 String path = GSPath.createPath(links);
930 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
931 if (info_elem == null) {
932 return null;
933 }
934 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
935 if (doctype_elem != null)
936 {
937 String doc_type = doctype_elem.getAttribute("value");
938 return doc_type;
939 }
940 return null;
941 }
942
943 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
944 // The docType remains constant as in parameter document_type
945 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
946 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
947 // The nodeType is root if rootNode, internal if there are children and leaf if no children
948 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
949
950 boolean isRoot = false;
951 if(id == null) { // rootNode, get the root nodeID to work with recursively
952 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
953 isRoot = true;
954 } else { // for all but the root node, need to still set the nodeID
955 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
956 }
957
958 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
959
960 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
961 if(docNodes.getLength() > 0) {
962 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
963 for(int i = 0; i < docNodes.getLength(); i++) {
964 Element childDocNode = (Element)docNodes.item(i);
965
966 // work out the child docNode's nodeID based on current id
967 String nodeID = id + "." + (i+1);
968 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
969 }
970 } else {
971 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
972 }
973
974 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
975 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
976
977 }
978
979 /** run the XSLT transform which converts from doc.xml format to our internal document format */
980 protected Element transformArchiveToDocument(Element section) {
981
982 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
983 if (stylesheet_filename == null) {
984 logger.error("Couldn't find stylesheet archive2document.xsl");
985 return section;
986 }
987
988 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
989 if (stylesheet_doc == null) {
990 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
991 return section;
992 }
993
994 Document section_doc = XMLConverter.newDOM();
995 section_doc.appendChild(section_doc.importNode(section, true));
996 Node result = this.transformer.transform(stylesheet_doc, section_doc);
997 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
998
999 Element new_element;
1000 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1001 new_element = ((Document) result).getDocumentElement();
1002 } else {
1003 new_element = (Element) result;
1004 }
1005
1006
1007 return new_element;
1008
1009 }
1010
1011
1012 /**
1013 * this involves a bit of a hack to get the equivalent query terms - has to
1014 * requery the query service - uses the last selected service name. (if it
1015 * ends in query). should this action do the query or should it send a
1016 * message to the query action? but that will involve lots of extra stuff.
1017 * also doesn't handle phrases properly - just highlights all the terms
1018 * found in the text.
1019 */
1020 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
1021 {
1022 Document doc = request.getOwnerDocument();
1023
1024 // do the query again to get term info
1025 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1026 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1027
1028 HashMap previous_params = (HashMap) params.get("p");
1029 if (previous_params == null)
1030 {
1031 return dc_response_doc_content;
1032 }
1033 String service_name = (String) previous_params.get(GSParams.SERVICE);
1034 if (service_name == null || !service_name.endsWith("Query"))
1035 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1036 logger.debug("invalid service, not doing highlighting");
1037 return dc_response_doc_content;
1038 }
1039 String collection = (String) params.get(GSParams.COLLECTION);
1040 UserContext userContext = new UserContext(request);
1041 String to = GSPath.appendLink(collection, service_name);
1042
1043 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1044 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1045 mr_query_message.appendChild(mr_query_request);
1046
1047 // paramList
1048 HashMap service_params = (HashMap) params.get("s1");
1049
1050 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1051 GSXML.addParametersToList(query_param_list, service_params);
1052 if (current_node_id != null) {
1053 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1054 } else {
1055 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1056 }
1057 mr_query_request.appendChild(query_param_list);
1058 // do the query
1059 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1060 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1061 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1062 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1063 if (highlighted_Node != null)
1064 {
1065 // Build a request to process highlighted text
1066
1067 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1068 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1069 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1070 hl_message.appendChild(dc_request);
1071
1072 // Create a parameter list to specify the request parameters - empty for now
1073 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1074 dc_request.appendChild(dc_param_list);
1075
1076 // get the content
1077 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1078 dc_request.appendChild(doc_list);
1079 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1080 doc_list.appendChild(current_doc);
1081 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1082 //Append highlighted content to request for processing
1083 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1084 Element hl_response_message = (Element) this.mr.process(hl_message);
1085
1086 //Get results
1087 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1088 Element content = (Element) contentList.item(0);
1089 return content;
1090 }
1091 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1092 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1093 if (query_term_list_element == null)
1094 {
1095 // no term info
1096 logger.error("No query term information.\n");
1097 return dc_response_doc_content;
1098 }
1099
1100 String content = GSXML.getNodeText(dc_response_doc_content);
1101
1102 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1103 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1104
1105 HashSet<String> query_term_variants = new HashSet<String>();
1106 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1107 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1108 {
1109 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1110 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1111 {
1112 for (int i = 0; i < terms_nodelist.getLength(); i++)
1113 {
1114 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1115 String termValueU = null;
1116 String termValueL = null;
1117
1118 if (termValue.length() > 1)
1119 {
1120 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1121 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1122 }
1123 else
1124 {
1125 termValueU = termValue.substring(0, 1).toUpperCase();
1126 termValueL = termValue.substring(0, 1).toLowerCase();
1127 }
1128
1129 query_term_variants.add(termValueU);
1130 query_term_variants.add(termValueL);
1131 }
1132 }
1133 }
1134 else
1135 {
1136 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1137 {
1138 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1139 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1140 for (int j = 0; j < equivalent_terms.length; j++)
1141 {
1142 query_term_variants.add(equivalent_terms[j]);
1143 }
1144 }
1145 }
1146
1147 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1148
1149 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1150 String performed_query = GSXML.getNodeText(query_element) + " ";
1151
1152 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1153 int term_start = 0;
1154 boolean in_term = false;
1155 boolean in_phrase = false;
1156 for (int i = 0; i < performed_query.length(); i++)
1157 {
1158 char character = performed_query.charAt(i);
1159 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1160
1161 // Has a query term just started?
1162 if (in_term == false && is_character_letter_or_digit == true)
1163 {
1164 in_term = true;
1165 term_start = i;
1166 }
1167
1168 // Or has a term just finished?
1169 else if (in_term == true && is_character_letter_or_digit == false)
1170 {
1171 in_term = false;
1172 String term = performed_query.substring(term_start, i);
1173
1174 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1175 if (term_element != null)
1176 {
1177
1178 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1179
1180 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1181 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1182 {
1183 String termValueU = null;
1184 String termValueL = null;
1185
1186 if (term.length() > 1)
1187 {
1188 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1189 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1190 }
1191 else
1192 {
1193 termValueU = term.substring(0, 1).toUpperCase();
1194 termValueL = term.substring(0, 1).toLowerCase();
1195 }
1196
1197 phrase_query_p_term_x_variants.add(termValueU);
1198 phrase_query_p_term_x_variants.add(termValueL);
1199 }
1200 else
1201 {
1202 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1203 {
1204 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1205 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1206 for (int k = 0; k < term_equivalent_terms.length; k++)
1207 {
1208 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1209 }
1210 }
1211 }
1212 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1213
1214 if (in_phrase == false)
1215 {
1216 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1217 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1218 }
1219 }
1220 }
1221 // Watch for phrases (surrounded by quotes)
1222 if (character == '\"')
1223 {
1224 // Has a phrase just started?
1225 if (in_phrase == false)
1226 {
1227 in_phrase = true;
1228 }
1229 // Or has a phrase just finished?
1230 else if (in_phrase == true)
1231 {
1232 in_phrase = false;
1233 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1234 }
1235
1236 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1237 }
1238 }
1239
1240 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1241 }
1242
1243 /**
1244 * Highlights query terms in a piece of text.
1245 */
1246 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1247 {
1248 // Convert the content string to an array of characters for speed
1249 char[] content_characters = new char[content.length()];
1250 content.getChars(0, content.length(), content_characters, 0);
1251
1252 // Now skim through the content, identifying word matches
1253 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1254 int word_start = 0;
1255 boolean in_word = false;
1256 boolean preceding_word_matched = false;
1257 boolean inTag = false;
1258 for (int i = 0; i < content_characters.length; i++)
1259 {
1260 //We don't want to find words inside HTML tags
1261 if (content_characters[i] == '<')
1262 {
1263 inTag = true;
1264 continue;
1265 }
1266 else if (inTag && content_characters[i] == '>')
1267 {
1268 inTag = false;
1269 }
1270 else if (inTag)
1271 {
1272 continue;
1273 }
1274
1275 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1276
1277 // Has a word just started?
1278 if (in_word == false && is_character_letter_or_digit == true)
1279 {
1280 in_word = true;
1281 word_start = i;
1282 }
1283
1284 // Or has a word just finished?
1285 else if (in_word == true && is_character_letter_or_digit == false)
1286 {
1287 in_word = false;
1288
1289 // Check if the word matches any of the query term equivalents
1290 String word = new String(content_characters, word_start, (i - word_start));
1291 if (query_term_variants.contains(word))
1292 {
1293 // We have found a matching word, so remember its location
1294 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1295 preceding_word_matched = true;
1296 }
1297 else
1298 {
1299 preceding_word_matched = false;
1300 }
1301 }
1302 }
1303
1304 // Don't forget the last word...
1305 if (in_word == true)
1306 {
1307 // Check if the word matches any of the query term equivalents
1308 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1309 if (query_term_variants.contains(word))
1310 {
1311 // We have found a matching word, so remember its location
1312 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1313 }
1314 }
1315
1316 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1317 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1318
1319 // Deal with phrases now
1320 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1321 for (int i = 0; i < word_matches.size(); i++)
1322 {
1323 WordMatch word_match = word_matches.get(i);
1324
1325 // See if any partial phrase matches are extended by this word
1326 if (word_match.preceding_word_matched)
1327 {
1328 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1329 {
1330 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1331 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1332 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1333 if (phrase_query_p_term_x_variants.contains(word_match.word))
1334 {
1335 partial_phrase_match.num_words_matched++;
1336
1337 // Has a complete phrase match occurred?
1338 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1339 {
1340 // Check for overlaps by looking at the previous highlight range
1341 if (!highlight_end_positions.isEmpty())
1342 {
1343 int last_highlight_index = highlight_end_positions.size() - 1;
1344 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1345 if (last_highlight_end > partial_phrase_match.start_position)
1346 {
1347 // There is an overlap, so remove the previous phrase match
1348 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1349 highlight_end_positions.remove(last_highlight_index);
1350 partial_phrase_match.start_position = last_highlight_start;
1351 }
1352 }
1353
1354 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1355 highlight_end_positions.add(new Integer(word_match.end_position));
1356 }
1357 // No, but add the partial match back into the list for next time
1358 else
1359 {
1360 partial_phrase_matches.add(partial_phrase_match);
1361 }
1362 }
1363 }
1364 }
1365 else
1366 {
1367 partial_phrase_matches.clear();
1368 }
1369
1370 // See if this word is at the start of any of the phrases
1371 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1372 {
1373 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1374 if (phrase_query_p_term_variants_list.size()>0) {
1375 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1376 if (phrase_query_p_term_1_variants.contains(word_match.word))
1377 {
1378 // If this phrase is just one word long, we have a complete match
1379 if (phrase_query_p_term_variants_list.size() == 1)
1380 {
1381 highlight_start_positions.add(new Integer(word_match.start_position));
1382 highlight_end_positions.add(new Integer(word_match.end_position));
1383 }
1384 // Otherwise we have the start of a potential phrase match
1385 else
1386 {
1387 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1388 }
1389 }
1390 }
1391 }
1392 }
1393
1394 // Now add the annotation tags into the document at the correct points
1395 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1396
1397 int last_wrote = 0;
1398 for (int i = 0; i < highlight_start_positions.size(); i++)
1399 {
1400 int highlight_start = highlight_start_positions.get(i).intValue();
1401 int highlight_end = highlight_end_positions.get(i).intValue();
1402
1403 // Print anything before the highlight range
1404 if (last_wrote < highlight_start)
1405 {
1406 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1407 content_element.appendChild(doc.createTextNode(preceding_text));
1408 }
1409
1410 // Print the highlight text, annotated
1411 if (highlight_end > last_wrote)
1412 {
1413 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1414 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1415 annotation_element.setAttribute("type", "query_term");
1416 content_element.appendChild(annotation_element);
1417 last_wrote = highlight_end;
1418 }
1419 }
1420
1421 // Finish off any unwritten text
1422 if (last_wrote < content_characters.length)
1423 {
1424 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1425 content_element.appendChild(doc.createTextNode(remaining_text));
1426 }
1427 return content_element;
1428 }
1429
1430 static private class WordMatch
1431 {
1432 public String word;
1433 public int start_position;
1434 public int end_position;
1435 public boolean preceding_word_matched;
1436
1437 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1438 {
1439 this.word = word;
1440 this.start_position = start_position;
1441 this.end_position = end_position;
1442 this.preceding_word_matched = preceding_word_matched;
1443 }
1444 }
1445
1446 static private class PartialPhraseMatch
1447 {
1448 public int start_position;
1449 public int query_phrase_number;
1450 public int num_words_matched;
1451
1452 public PartialPhraseMatch(int start_position, int query_phrase_number)
1453 {
1454 this.start_position = start_position;
1455 this.query_phrase_number = query_phrase_number;
1456 this.num_words_matched = 1;
1457 }
1458 }
1459}
Note: See TracBrowser for help on using the repository browser.