source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32128

Last change on this file since 32128 was 32128, checked in by Georgiy Litvinov, 6 years ago

Remove all sections from requests except needed by inline template.

  • Property svn:keywords set to Author Date Id Revision
File size: 54.2 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
// log4j logger shared by all instances of this action
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib";
// a non-empty value is turned by process() into a "." + value + ".ss" suffix on the requested node id
50 public static final String GOTO_PAGE_ARG = "gp";
// "1" makes process() send the retrieved content to the selected enrich service (only when provide_annotations is on)
51 public static final String ENRICH_DOC_ARG = "end";
// "1" asks for the text of every section; it also forces the contents to be expanded and overrides NO_TEXT_ARG
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
// "1" asks for the "entire" document structure rather than just ancestors/children
53 public static final String EXPAND_CONTENTS_ARG = "ec";
// registered in addActionParameters(); not read by process() in the code visible here - presumably consumed by the interface layer (TODO confirm)
54 public static final String REALISTIC_BOOK_ARG = "book";
// "1" makes process() return after the metadata step without fetching document text (unless EXPAND_DOCUMENT_ARG is set)
55 public static final String NO_TEXT_ARG = "noText";
// "1" switches process() into edit mode: the section is fetched via the DocXMLGetSection service and transformed
56 public static final String DOC_EDIT_ARG = "docEdit";
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well
61 */
62 protected boolean provide_annotations = false;
63
// set by configure() from the "highlightQueryTerms" config param; when true, process() passes retrieved content through highlightQueryTerms()
64 protected boolean highlight_query_terms = false;
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 HashMap service_params = (HashMap) params.get("s0");
101
102 String collection = (String) params.get(GSParams.COLLECTION);
103 String document_id = (String) params.get(GSParams.DOCUMENT);
104 if (document_id != null && document_id.equals(""))
105 {
106 document_id = null;
107 }
108 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109 if (href != null && href.equals(""))
110 {
111 href = null;
112 }
113 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114 if (document_id == null && href == null)
115 {
116 logger.error("no document specified!");
117 return result;
118 }
119 if (rl != null && rl.equals("0"))
120 {
121 // this is a true external link, we should have been directed to a different page or action
122 logger.error("rl value was 0, shouldn't get here");
123 return result;
124 }
125
126 UserContext userContext = new UserContext(request);
127
128 //append site metadata
129 addSiteMetadata(page_response, userContext);
130 addInterfaceOptions(page_response);
131
132 // get the additional data needed for the page
133 getBackgroundData(page_response, collection, userContext);
134 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136 if (format_elem != null) {
137 // lets look for param defaults set in config file
138 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139 for (int i=0; i<param_defaults.getLength(); i++) {
140 Element p = (Element)param_defaults.item(i);
141 String name = p.getAttribute(GSXML.NAME_ATT);
142 if (params.get(name) ==null) {
143 // wasn't set from interface
144 String value = p.getAttribute(GSXML.VALUE_ATT);
145 params.put(name, value );
146 // also add into request param xml so that xslt knows it too
147 GSXML.addParameterToList(cgi_paramList, name, value);
148 }
149 }
150 }
151
152 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153 if (document_type != null && document_type.equals(""))
154 {
155 //document_type = "hierarchy";
156 document_type = null; // we'll get it later if not already specified
157 }
158 // what if it is null here?? Anu to check...
159
160
161 boolean editing_document = false;
162 String doc_edit = (String) params.get(DOC_EDIT_ARG);
163 if (doc_edit != null && doc_edit.equals("1")) {
164 editing_document = true;
165 }
166
167 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
168 if (editing_document) {
169
170 // call get archive doc
171 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172 String to = "DocXMLGetSection";
173 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174 dx_message.appendChild(dx_request);
175 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178 dx_request.appendChild(dx_section);
179
180 Element dx_response_message = (Element) this.mr.process(dx_message);
181 if (processErrorElements(dx_response_message, page_response))
182 {
183 return result;
184 }
185
186 // get the section out
187 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189 if (section == null) {
190 logger.error("no archive doc returned for "+document_id);
191 return result;
192 }
193 // convert the archive format into the internal format that the page response requires
194
195 // work out doctype
196 // NOTE: this will be coming from collection database in index
197 // the archive file doesn't store this. So we have to assume
198 // that the doc type will not be changing with any
199 // modifications happening to archives.
200
201 // if doc type is null, then we need to work it out.
202 // create a basic doc list containing the current node
203
204 if (document_type == null) {
205 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
206 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
207 basic_doc_list.appendChild(current_doc);
208 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
209 basic_doc_list.appendChild(current_doc);
210 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
211 }
212
213 if (document_type == null) {
214 logger.debug("@@@ doctype is null, setting to simple");
215 document_type = GSXML.DOC_TYPE_SIMPLE;
216 }
217
218 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
219 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
220 page_response.appendChild(doc_elem);
221
222 Element transformed_section = transformArchiveToDocument(section);
223 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
224 // simple doc, only returning a single document node, which is the top level section.
225 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
226 GSXML.mergeElements(doc_elem, transformed_section);
227 return result;
228 }
229
230 // multi sectioned document.
231 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
232 // In docEdit mode, we obtain the text from archives, from doc.xml
233 // Now the transformation has replaced <Section> with <documentNode>
234 // Need to add nodeID, nodeType and docType attributes to each docNode
235 // as doc.xml doesn't store that.
236 insertDocNodeAttributes(transformed_section, document_type, null);
237 doc_elem.appendChild(doc.importNode(transformed_section, true));
238 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
239
240 return result;
241 }
242
243 //whether to retrieve siblings or not
244 boolean get_siblings = false;
245 String sibs = (String) params.get(SIBLING_ARG);
246 if (sibs != null && sibs.equals("1"))
247 {
248 get_siblings = true;
249 }
250
251 String doc_id_modifier = "";
252 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
253 if (sibling_num != null && !sibling_num.equals(""))
254 {
255 // we have to modify the doc name
256 doc_id_modifier = "." + sibling_num + ".ss";
257 }
258
259 boolean expand_document = false;
260 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
261 if (ed_arg != null && ed_arg.equals("1"))
262 {
263 expand_document = true;
264 }
265
266 boolean expand_contents = false;
267 if (expand_document)
268 { // we always expand the contents with the text
269 expand_contents = true;
270 }
271 else
272 {
273 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
274 if (ec_arg != null && ec_arg.equals("1"))
275 {
276 expand_contents = true;
277 }
278 }
279
280 // do we want text content? Not if no_text=1.
281 // expand_document overrides this. - should it??
282 boolean get_text = true;
283 String nt_arg = (String) params.get(NO_TEXT_ARG);
284
285 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
286 logger.debug("SETTING GET TEXT TO FALSE");
287 get_text = false;
288 } else {
289 logger.debug("GET TEXT REMAINS TRUE");
290 }
291
292 // the_document is where all the doc info - structure and metadata etc
293 // is added into, to be returned in the page
294 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
295 page_response.appendChild(the_document);
296
297 // create a basic doc list containing the current node
298 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
299 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
300 basic_doc_list.appendChild(current_doc);
301 if (document_id != null)
302 {
303 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
304 }
305 else
306 {
307 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
308 // do we need this??
309 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
310 }
311
312 if (document_type == null)
313 {
314 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
315 }
316 if (document_type == null)
317 {
318 logger.debug("##### doctype is null, setting to simple");
319 document_type = GSXML.DOC_TYPE_SIMPLE;
320 }
321
322 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
323
324 // Create a parameter list to specify the required structure information
325 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
326
327 if (service_params != null)
328 {
329 GSXML.addParametersToList(ds_param_list, service_params);
330 }
331
332 Element ds_param = null;
333 boolean get_structure = false;
334 boolean get_structure_info = false;
335 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
336 {
337 get_structure_info = true;
338
339 if (expand_contents)
340 {
341 ds_param = doc.createElement(GSXML.PARAM_ELEM);
342 ds_param_list.appendChild(ds_param);
343 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
344 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
345 }
346
347 // get the info needed for paged naviagtion
348 ds_param = doc.createElement(GSXML.PARAM_ELEM);
349 ds_param_list.appendChild(ds_param);
350 ds_param.setAttribute(GSXML.NAME_ATT, "info");
351 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
352 ds_param = doc.createElement(GSXML.PARAM_ELEM);
353 ds_param_list.appendChild(ds_param);
354 ds_param.setAttribute(GSXML.NAME_ATT, "info");
355 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
356 ds_param = doc.createElement(GSXML.PARAM_ELEM);
357 ds_param_list.appendChild(ds_param);
358 ds_param.setAttribute(GSXML.NAME_ATT, "info");
359 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
360
361 if (get_siblings)
362 {
363 ds_param = doc.createElement(GSXML.PARAM_ELEM);
364 ds_param_list.appendChild(ds_param);
365 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
366 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
367 }
368
369 }
370 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
371 {
372 get_structure = true;
373 if (expand_contents)
374 {
375 ds_param = doc.createElement(GSXML.PARAM_ELEM);
376 ds_param_list.appendChild(ds_param);
377 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
378 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
379 }
380 else
381 {
382 // get the info needed for table of contents
383 ds_param = doc.createElement(GSXML.PARAM_ELEM);
384 ds_param_list.appendChild(ds_param);
385 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
386 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
387 ds_param = doc.createElement(GSXML.PARAM_ELEM);
388 ds_param_list.appendChild(ds_param);
389 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
390 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
391 if (get_siblings)
392 {
393 ds_param = doc.createElement(GSXML.PARAM_ELEM);
394 ds_param_list.appendChild(ds_param);
395 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
396 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
397 }
398 }
399 }
400 else
401 {
402 // we dont need any structure
403 }
404
405 boolean has_dummy = false;
406 if (get_structure || get_structure_info)
407 {
408
409 // Build a request to obtain the document structure
410 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
411 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
412 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
413 ds_message.appendChild(ds_request);
414 ds_request.appendChild(ds_param_list);
415
416 // add the node list we created earlier
417 ds_request.appendChild(basic_doc_list);
418
419 // Process the document structure retrieve message
420 Element ds_response_message = (Element) this.mr.process(ds_message);
421 if (processErrorElements(ds_response_message, page_response))
422 {
423 return result;
424 }
425
426 // get the info and print out
427 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
428 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
429 path = GSPath.appendLink(path, "nodeStructureInfo");
430 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
431 // get the doc_node bit
432 if (ds_response_struct_info != null)
433 {
434 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
435 }
436 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
437 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
438 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
439 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
440
441 if (ds_response_structure != null)
442 {
443 // add the contents of the structure bit into the_document
444 NodeList structs = ds_response_structure.getChildNodes();
445 for (int i = 0; i < structs.getLength(); i++)
446 {
447 the_document.appendChild(doc.importNode(structs.item(i), true));
448 }
449 }
450 else
451 {
452 // no structure nodes, so put in a dummy doc node
453 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
454 if (document_id != null)
455 {
456 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
457 }
458 else
459 {
460 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
461
462 }
463 the_document.appendChild(doc_node);
464 has_dummy = true;
465 }
466 }
467 else
468 { // a simple type - we dont have a dummy node for simple
469 // should think about this more
470 // no structure request, so just put in a dummy doc node
471 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
472 if (document_id != null)
473 {
474 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
475 }
476 else
477 {
478 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
479 }
480 the_document.appendChild(doc_node);
481 has_dummy = true;
482 }
483
484 // Build a request to obtain some document metadata
485 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
486 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
487 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
488 dm_message.appendChild(dm_request);
489 // Create a parameter list to specify the required metadata information
490
491 HashSet<String> meta_names = new HashSet<String>();
492 meta_names.add("Title"); // the default
493 if (format_elem != null)
494 {
495 getRequiredMetadataNames(format_elem, meta_names);
496 }
497
498 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
499 if (extraMetaListElem != null)
500 {
501 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
502 for (int i = 0; i < extraMetaList.getLength(); i++)
503 {
504 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
505 }
506 }
507
508 Element dm_param_list = createMetadataParamList(doc,meta_names);
509 if (service_params != null)
510 {
511 GSXML.addParametersToList(dm_param_list, service_params);
512 }
513
514 dm_request.appendChild(dm_param_list);
515
516 // create the doc node list for the metadata request
517 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
518 dm_request.appendChild(dm_doc_list);
519
520 // Add each node from the structure response into the metadata request
521 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
522 for (int i = 0; i < doc_nodes.getLength(); i++)
523 {
524 Element doc_node = (Element) doc_nodes.item(i);
525 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
526
527 // Add the documentNode to the list
528 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
529 if (needSectionContent(params)) {
530 if (doc_node_id.equals(document_id)) {
531 dm_doc_list.appendChild(dm_doc_node);
532 }
533 } else {
534 dm_doc_list.appendChild(dm_doc_node);
535 }
536 //dm_doc_list.appendChild(dm_doc_node);
537 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
538 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
539 if (document_id == null){
540 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
541 }
542
543 }
544 // we also want a metadata request to the top level document to get
545 // assocfilepath - this could be cached too
546 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
547 dm_message.appendChild(doc_meta_request);
548 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
549 if (service_params != null)
550 {
551 GSXML.addParametersToList(doc_meta_param_list, service_params);
552 }
553
554 doc_meta_request.appendChild(doc_meta_param_list);
555 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
556 doc_meta_param_list.appendChild(doc_param);
557 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
558 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
559
560 // create the doc node list for the metadata request
561 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
562 doc_meta_request.appendChild(doc_list);
563
564 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
565 // the node we want is the root document node
566 if (document_id != null)
567 {
568 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
569 }
570 /*else
571 {
572 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
573 // can we assume that href is always a top level doc??
574 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
575 //doc_node.setAttribute("externalURL", has_rl);
576 }*/
577 doc_list.appendChild(doc_node);
578
579 Element dm_response_message = (Element) this.mr.process(dm_message);
580 if (processErrorElements(dm_response_message, page_response))
581 {
582 return result;
583 }
584
585 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
586 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
587
588 // Merge the metadata with the structure information
589 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
590 for (int i = 0; i < doc_nodes.getLength(); i++)
591 {
592 String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
593 Node dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
594 GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
595 }
596 // get the top level doc metadata out
597 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
598 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
599 GSXML.mergeMetadataLists(the_document, top_doc_node);
600
601 // do we want doc text content? If not, we are done.
602 if (!get_text) {
603 // don't get text
604 return result;
605 }
606
607 // Build a request to obtain some document content
608 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
609 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
610 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
611 dc_message.appendChild(dc_request);
612
613 // Create a parameter list to specify the request parameters - empty for now
614 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
615 if (service_params != null)
616 {
617 GSXML.addParametersToList(dc_param_list, service_params);
618 }
619
620 dc_request.appendChild(dc_param_list);
621
622 // get the content
623 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
624 if (expand_document)
625 {
626 dc_request.appendChild(dm_doc_list);
627 }
628 else
629 {
630 dc_request.appendChild(basic_doc_list);
631 }
632 Element dc_response_message = (Element) this.mr.process(dc_message);
633
634 if (processErrorElements(dc_response_message, page_response))
635 {
636 return result;
637
638 }
639 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
640
641 if (expand_document)
642 {
643 // Merge the content with the structure information
644 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
645 for (int i = 0; i < doc_nodes.getLength(); i++)
646 {
647 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
648 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
649 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
650 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
651 if (content != null)
652 {
653 if (highlight_query_terms)
654 {
655
656 content = highlightQueryTerms(request, node_id, (Element) content);
657 }
658
659 doc_nodes.item(i).appendChild(doc.importNode(content, true));
660 }
661 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
662 }
663 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
664 Element dummy_node = (Element) doc_nodes.item(0);
665 the_document.removeChild(dummy_node);
666 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
667 NodeList dummy_children = dummy_node.getChildNodes();
668 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
669 {
670 // special case as we don't want more than one metadata list
671 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
672 {
673 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
674 }
675 else
676 {
677 the_document.appendChild(dummy_children.item(i));
678 }
679 }
680 }
681 }
682 else
683 {
684 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
685 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
686 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
687 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
688
689 if (dc_response_doc_content == null)
690 {
691 // no content to add
692 if (dc_response_doc.getAttribute("external").equals("true"))
693 {
694
695 //if (dc_response_doc_external != null)
696 //{
697 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
698
699 the_document.setAttribute("selectedNode", href_id);
700 the_document.setAttribute("external", href_id);
701 }
702 return result;
703 }
704 if (highlight_query_terms)
705 {
706 dc_response_doc.removeChild(dc_response_doc_content);
707
708 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
709 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
710 }
711
712 if (provide_annotations)
713 {
714 String service_selected = (String) params.get(ENRICH_DOC_ARG);
715 if (service_selected != null && service_selected.equals("1"))
716 {
717 // now we can modifiy the response doc if needed
718 String enrich_service = (String) params.get(GSParams.SERVICE);
719 // send a message to the service
720 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
721 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
722 enrich_message.appendChild(enrich_request);
723 // check for parameters
724 HashMap e_service_params = (HashMap) params.get("s1");
725 if (e_service_params != null)
726 {
727 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
728 GSXML.addParametersToList(enrich_pl, e_service_params);
729 enrich_request.appendChild(enrich_pl);
730 }
731 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
732 enrich_request.appendChild(e_doc_list);
733 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
734
735 Node enrich_response = this.mr.process(enrich_message);
736
737 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
738 path = GSPath.createPath(links);
739 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
740
741 }
742 } // if provide_annotations
743
744 // use the returned id rather than the sent one cos there may have
745 // been modifiers such as .pr that are removed.
746 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
747 the_document.setAttribute("selectedNode", modified_doc_id);
748 if (has_dummy)
749 {
750 // change the id if necessary and add the content
751 Element dummy_node = (Element) doc_nodes.item(0);
752
753 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
754 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
755 // hack for simple type
756 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
757 {
758 // we dont want the internal docNode, just want the content and metadata in the document
759 // rethink this!!
760 the_document.removeChild(dummy_node);
761
762 NodeList dummy_children = dummy_node.getChildNodes();
763 //for (int i=0; i<dummy_children.getLength(); i++) {
764 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
765 {
766 // special case as we don't want more than one metadata list
767 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
768 {
769 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
770 }
771 else
772 {
773 the_document.appendChild(dummy_children.item(i));
774 }
775 }
776 }
777
778 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
779 }
780 else
781 {
782 // Merge the document content with the metadata and structure information
783 for (int i = 0; i < doc_nodes.getLength(); i++)
784 {
785 Node dn = doc_nodes.item(i);
786 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
787 if (dn_id.equals(modified_doc_id))
788 {
789 dn.appendChild(doc.importNode(dc_response_doc_content, true));
790 break;
791 }
792 }
793 }
794 }
795 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
796 return result;
797 }
798
799 /**
800 * tell the param class what its arguments are if an action has its own
801 * arguments, this should add them to the params object - particularly
802 * important for args that should not be saved
803 */
804 public boolean addActionParameters(GSParams params)
805 {
806 params.addParameter(GOTO_PAGE_ARG, false);
807 params.addParameter(ENRICH_DOC_ARG, false);
808 params.addParameter(EXPAND_DOCUMENT_ARG, false);
809 params.addParameter(EXPAND_CONTENTS_ARG, false);
810 params.addParameter(REALISTIC_BOOK_ARG, false);
811
812 return true;
813 }
814
815 private boolean needSectionContent(HashMap<String, Serializable> params) {
816 String document_id = (String) params.get(GSParams.DOCUMENT);
817 String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
818 String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
819 if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
820 return true;
821 }
822
823 return false;
824 }
825 /**
826 * this method gets the collection description, the format info, the list of
827 * enrich services, etc - stuff that is needed for the page, but is the same
828 * whatever the query is - should be cached
829 */
830 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
831 {
832 Document doc = page_response.getOwnerDocument();
833
834 // create a message to process - contains requests for the collection
835 // description, the format element, the enrich services on offer
836 // these could all be cached
837 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
838 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
839 // the format request - ignore for now, where does this request go to??
840 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
841 info_message.appendChild(format_request);
842
843 // the enrich_services request - only do this if provide_annotations is true
844
845 if (provide_annotations)
846 {
847 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
848 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
849 info_message.appendChild(enrich_services_request);
850 }
851
852 Element info_response = (Element) this.mr.process(info_message);
853
854 // the collection is the first response
855 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
856 Element format_resp = (Element) responses.item(0);
857
858 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
859 if (format_elem != null)
860 {
861 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
862 if (global_format_elem != null)
863 {
864 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
865 }
866
867 // set the format type
868 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
869 page_response.appendChild(doc.importNode(format_elem, true));
870 }
871
872 if (provide_annotations)
873 {
874 Element services_resp = (Element) responses.item(1);
875
876 // a new message for the mr
877 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
878 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
879 boolean service_found = false;
880 for (int j = 0; j < e_services.getLength(); j++)
881 {
882 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
883 {
884 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
885 enrich_message.appendChild(s);
886 service_found = true;
887 }
888 }
889 if (service_found)
890 {
891 Element enrich_response = (Element) this.mr.process(enrich_message);
892
893 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
894 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
895 for (int i = 0; i < e_responses.getLength(); i++)
896 {
897 Element e_resp = (Element) e_responses.item(i);
898 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
899 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
900 service_list.appendChild(e_service);
901 }
902 page_response.appendChild(service_list);
903 }
904 } // if provide_annotations
905 return true;
906
907 }
908
909 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
910 {
911 Document doc = basic_doc_list.getOwnerDocument();
912
913 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
914 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
915 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
916 ds_message.appendChild(ds_request);
917
918 // Create a parameter list to specify the required structure information
919 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
920 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
921 ds_param_list.appendChild(ds_param);
922 ds_param.setAttribute(GSXML.NAME_ATT, "info");
923 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
924
925 ds_request.appendChild(ds_param_list);
926
927 // add the node list we created earlier
928 ds_request.appendChild(basic_doc_list);
929
930 // Process the document structure retrieve message
931 Element ds_response_message = (Element) this.mr.process(ds_message);
932 if (processErrorElements(ds_response_message, page_response))
933 {
934 return null;
935 }
936
937 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
938 String path = GSPath.createPath(links);
939 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
940 if (info_elem == null) {
941 return null;
942 }
943 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
944 if (doctype_elem != null)
945 {
946 String doc_type = doctype_elem.getAttribute("value");
947 return doc_type;
948 }
949 return null;
950 }
951
952 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
953 // The docType remains constant as in parameter document_type
954 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
955 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
956 // The nodeType is root if rootNode, internal if there are children and leaf if no children
957 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
958
959 boolean isRoot = false;
960 if(id == null) { // rootNode, get the root nodeID to work with recursively
961 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
962 isRoot = true;
963 } else { // for all but the root node, need to still set the nodeID
964 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
965 }
966
967 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
968
969 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
970 if(docNodes.getLength() > 0) {
971 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
972 for(int i = 0; i < docNodes.getLength(); i++) {
973 Element childDocNode = (Element)docNodes.item(i);
974
975 // work out the child docNode's nodeID based on current id
976 String nodeID = id + "." + (i+1);
977 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
978 }
979 } else {
980 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
981 }
982
983 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
984 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
985
986 }
987
988 /** run the XSLT transform which converts from doc.xml format to our internal document format */
989 protected Element transformArchiveToDocument(Element section) {
990
991 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
992 if (stylesheet_filename == null) {
993 logger.error("Couldn't find stylesheet archive2document.xsl");
994 return section;
995 }
996
997 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
998 if (stylesheet_doc == null) {
999 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
1000 return section;
1001 }
1002
1003 Document section_doc = XMLConverter.newDOM();
1004 section_doc.appendChild(section_doc.importNode(section, true));
1005 Node result = this.transformer.transform(stylesheet_doc, section_doc);
1006 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
1007
1008 Element new_element;
1009 if (result.getNodeType() == Node.DOCUMENT_NODE) {
1010 new_element = ((Document) result).getDocumentElement();
1011 } else {
1012 new_element = (Element) result;
1013 }
1014
1015
1016 return new_element;
1017
1018 }
1019
1020
1021 /**
1022 * this involves a bit of a hack to get the equivalent query terms - has to
1023 * requery the query service - uses the last selected service name. (if it
1024 * ends in query). should this action do the query or should it send a
1025 * message to the query action? but that will involve lots of extra stuff.
1026 * also doesn't handle phrases properly - just highlights all the terms
1027 * found in the text.
1028 */
1029 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
1030 {
1031 Document doc = request.getOwnerDocument();
1032
1033 // do the query again to get term info
1034 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1035 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1036
1037 HashMap previous_params = (HashMap) params.get("p");
1038 if (previous_params == null)
1039 {
1040 return dc_response_doc_content;
1041 }
1042 String service_name = (String) previous_params.get(GSParams.SERVICE);
1043 if (service_name == null || !service_name.endsWith("Query"))
1044 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1045 logger.debug("invalid service, not doing highlighting");
1046 return dc_response_doc_content;
1047 }
1048 String collection = (String) params.get(GSParams.COLLECTION);
1049 UserContext userContext = new UserContext(request);
1050 String to = GSPath.appendLink(collection, service_name);
1051
1052 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1053 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1054 mr_query_message.appendChild(mr_query_request);
1055
1056 // paramList
1057 HashMap service_params = (HashMap) params.get("s1");
1058
1059 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1060 GSXML.addParametersToList(query_param_list, service_params);
1061 if (current_node_id != null) {
1062 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1063 } else {
1064 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1065 }
1066 mr_query_request.appendChild(query_param_list);
1067 // do the query
1068 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1069 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1070 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1071 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1072 if (highlighted_Node != null)
1073 {
1074 // Build a request to process highlighted text
1075
1076 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1077 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1078 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1079 hl_message.appendChild(dc_request);
1080
1081 // Create a parameter list to specify the request parameters - empty for now
1082 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1083 dc_request.appendChild(dc_param_list);
1084
1085 // get the content
1086 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1087 dc_request.appendChild(doc_list);
1088 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1089 doc_list.appendChild(current_doc);
1090 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1091 //Append highlighted content to request for processing
1092 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1093 Element hl_response_message = (Element) this.mr.process(hl_message);
1094
1095 //Get results
1096 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1097 Element content = (Element) contentList.item(0);
1098 return content;
1099 }
1100 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1101 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1102 if (query_term_list_element == null)
1103 {
1104 // no term info
1105 logger.error("No query term information.\n");
1106 return dc_response_doc_content;
1107 }
1108
1109 String content = GSXML.getNodeText(dc_response_doc_content);
1110
1111 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1112 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1113
1114 HashSet<String> query_term_variants = new HashSet<String>();
1115 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1116 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1117 {
1118 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1119 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1120 {
1121 for (int i = 0; i < terms_nodelist.getLength(); i++)
1122 {
1123 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1124 String termValueU = null;
1125 String termValueL = null;
1126
1127 if (termValue.length() > 1)
1128 {
1129 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1130 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1131 }
1132 else
1133 {
1134 termValueU = termValue.substring(0, 1).toUpperCase();
1135 termValueL = termValue.substring(0, 1).toLowerCase();
1136 }
1137
1138 query_term_variants.add(termValueU);
1139 query_term_variants.add(termValueL);
1140 }
1141 }
1142 }
1143 else
1144 {
1145 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1146 {
1147 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1148 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1149 for (int j = 0; j < equivalent_terms.length; j++)
1150 {
1151 query_term_variants.add(equivalent_terms[j]);
1152 }
1153 }
1154 }
1155
1156 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1157
1158 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1159 String performed_query = GSXML.getNodeText(query_element) + " ";
1160
1161 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1162 int term_start = 0;
1163 boolean in_term = false;
1164 boolean in_phrase = false;
1165 for (int i = 0; i < performed_query.length(); i++)
1166 {
1167 char character = performed_query.charAt(i);
1168 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1169
1170 // Has a query term just started?
1171 if (in_term == false && is_character_letter_or_digit == true)
1172 {
1173 in_term = true;
1174 term_start = i;
1175 }
1176
1177 // Or has a term just finished?
1178 else if (in_term == true && is_character_letter_or_digit == false)
1179 {
1180 in_term = false;
1181 String term = performed_query.substring(term_start, i);
1182
1183 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1184 if (term_element != null)
1185 {
1186
1187 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1188
1189 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1190 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1191 {
1192 String termValueU = null;
1193 String termValueL = null;
1194
1195 if (term.length() > 1)
1196 {
1197 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1198 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1199 }
1200 else
1201 {
1202 termValueU = term.substring(0, 1).toUpperCase();
1203 termValueL = term.substring(0, 1).toLowerCase();
1204 }
1205
1206 phrase_query_p_term_x_variants.add(termValueU);
1207 phrase_query_p_term_x_variants.add(termValueL);
1208 }
1209 else
1210 {
1211 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1212 {
1213 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1214 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1215 for (int k = 0; k < term_equivalent_terms.length; k++)
1216 {
1217 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1218 }
1219 }
1220 }
1221 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1222
1223 if (in_phrase == false)
1224 {
1225 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1226 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1227 }
1228 }
1229 }
1230 // Watch for phrases (surrounded by quotes)
1231 if (character == '\"')
1232 {
1233 // Has a phrase just started?
1234 if (in_phrase == false)
1235 {
1236 in_phrase = true;
1237 }
1238 // Or has a phrase just finished?
1239 else if (in_phrase == true)
1240 {
1241 in_phrase = false;
1242 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1243 }
1244
1245 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1246 }
1247 }
1248
1249 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1250 }
1251
1252 /**
1253 * Highlights query terms in a piece of text.
1254 */
1255 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1256 {
1257 // Convert the content string to an array of characters for speed
1258 char[] content_characters = new char[content.length()];
1259 content.getChars(0, content.length(), content_characters, 0);
1260
1261 // Now skim through the content, identifying word matches
1262 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1263 int word_start = 0;
1264 boolean in_word = false;
1265 boolean preceding_word_matched = false;
1266 boolean inTag = false;
1267 for (int i = 0; i < content_characters.length; i++)
1268 {
1269 //We don't want to find words inside HTML tags
1270 if (content_characters[i] == '<')
1271 {
1272 inTag = true;
1273 continue;
1274 }
1275 else if (inTag && content_characters[i] == '>')
1276 {
1277 inTag = false;
1278 }
1279 else if (inTag)
1280 {
1281 continue;
1282 }
1283
1284 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1285
1286 // Has a word just started?
1287 if (in_word == false && is_character_letter_or_digit == true)
1288 {
1289 in_word = true;
1290 word_start = i;
1291 }
1292
1293 // Or has a word just finished?
1294 else if (in_word == true && is_character_letter_or_digit == false)
1295 {
1296 in_word = false;
1297
1298 // Check if the word matches any of the query term equivalents
1299 String word = new String(content_characters, word_start, (i - word_start));
1300 if (query_term_variants.contains(word))
1301 {
1302 // We have found a matching word, so remember its location
1303 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1304 preceding_word_matched = true;
1305 }
1306 else
1307 {
1308 preceding_word_matched = false;
1309 }
1310 }
1311 }
1312
1313 // Don't forget the last word...
1314 if (in_word == true)
1315 {
1316 // Check if the word matches any of the query term equivalents
1317 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1318 if (query_term_variants.contains(word))
1319 {
1320 // We have found a matching word, so remember its location
1321 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1322 }
1323 }
1324
1325 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1326 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1327
1328 // Deal with phrases now
1329 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1330 for (int i = 0; i < word_matches.size(); i++)
1331 {
1332 WordMatch word_match = word_matches.get(i);
1333
1334 // See if any partial phrase matches are extended by this word
1335 if (word_match.preceding_word_matched)
1336 {
1337 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1338 {
1339 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1340 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1341 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1342 if (phrase_query_p_term_x_variants.contains(word_match.word))
1343 {
1344 partial_phrase_match.num_words_matched++;
1345
1346 // Has a complete phrase match occurred?
1347 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1348 {
1349 // Check for overlaps by looking at the previous highlight range
1350 if (!highlight_end_positions.isEmpty())
1351 {
1352 int last_highlight_index = highlight_end_positions.size() - 1;
1353 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1354 if (last_highlight_end > partial_phrase_match.start_position)
1355 {
1356 // There is an overlap, so remove the previous phrase match
1357 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1358 highlight_end_positions.remove(last_highlight_index);
1359 partial_phrase_match.start_position = last_highlight_start;
1360 }
1361 }
1362
1363 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1364 highlight_end_positions.add(new Integer(word_match.end_position));
1365 }
1366 // No, but add the partial match back into the list for next time
1367 else
1368 {
1369 partial_phrase_matches.add(partial_phrase_match);
1370 }
1371 }
1372 }
1373 }
1374 else
1375 {
1376 partial_phrase_matches.clear();
1377 }
1378
1379 // See if this word is at the start of any of the phrases
1380 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1381 {
1382 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1383 if (phrase_query_p_term_variants_list.size()>0) {
1384 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1385 if (phrase_query_p_term_1_variants.contains(word_match.word))
1386 {
1387 // If this phrase is just one word long, we have a complete match
1388 if (phrase_query_p_term_variants_list.size() == 1)
1389 {
1390 highlight_start_positions.add(new Integer(word_match.start_position));
1391 highlight_end_positions.add(new Integer(word_match.end_position));
1392 }
1393 // Otherwise we have the start of a potential phrase match
1394 else
1395 {
1396 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1397 }
1398 }
1399 }
1400 }
1401 }
1402
1403 // Now add the annotation tags into the document at the correct points
1404 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1405
1406 int last_wrote = 0;
1407 for (int i = 0; i < highlight_start_positions.size(); i++)
1408 {
1409 int highlight_start = highlight_start_positions.get(i).intValue();
1410 int highlight_end = highlight_end_positions.get(i).intValue();
1411
1412 // Print anything before the highlight range
1413 if (last_wrote < highlight_start)
1414 {
1415 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1416 content_element.appendChild(doc.createTextNode(preceding_text));
1417 }
1418
1419 // Print the highlight text, annotated
1420 if (highlight_end > last_wrote)
1421 {
1422 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1423 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1424 annotation_element.setAttribute("type", "query_term");
1425 content_element.appendChild(annotation_element);
1426 last_wrote = highlight_end;
1427 }
1428 }
1429
1430 // Finish off any unwritten text
1431 if (last_wrote < content_characters.length)
1432 {
1433 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1434 content_element.appendChild(doc.createTextNode(remaining_text));
1435 }
1436 return content_element;
1437 }
1438
1439 static private class WordMatch
1440 {
1441 public String word;
1442 public int start_position;
1443 public int end_position;
1444 public boolean preceding_word_matched;
1445
1446 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1447 {
1448 this.word = word;
1449 this.start_position = start_position;
1450 this.end_position = end_position;
1451 this.preceding_word_matched = preceding_word_matched;
1452 }
1453 }
1454
1455 static private class PartialPhraseMatch
1456 {
1457 public int start_position;
1458 public int query_phrase_number;
1459 public int num_words_matched;
1460
1461 public PartialPhraseMatch(int start_position, int query_phrase_number)
1462 {
1463 this.start_position = start_position;
1464 this.query_phrase_number = query_phrase_number;
1465 this.num_words_matched = 1;
1466 }
1467 }
1468}
Note: See TracBrowser for help on using the repository browser.