source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32111

Last change on this file since 32111 was 32111, checked in by kjdon, 6 years ago

Pass in the base interfaces array to the call that finds archive2document.xsl. If you have a custom interface, the file will probably live in the default one. Then check to make sure the file was there before trying to use it.

  • Property svn:keywords set to Author Date Id Revision
File size: 53.3 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
47
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib";
50 public static final String GOTO_PAGE_ARG = "gp";
51 public static final String ENRICH_DOC_ARG = "end";
52 public static final String EXPAND_DOCUMENT_ARG = "ed";
53 public static final String EXPAND_CONTENTS_ARG = "ec";
54 public static final String REALISTIC_BOOK_ARG = "book";
55 public static final String NO_TEXT_ARG = "noText";
56 public static final String DOC_EDIT_ARG = "docEdit";
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well
61 */
62 protected boolean provide_annotations = false;
63
64 protected boolean highlight_query_terms = false;
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 HashMap service_params = (HashMap) params.get("s0");
101
102 String collection = (String) params.get(GSParams.COLLECTION);
103 String document_id = (String) params.get(GSParams.DOCUMENT);
104 if (document_id != null && document_id.equals(""))
105 {
106 document_id = null;
107 }
108 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109 if (href != null && href.equals(""))
110 {
111 href = null;
112 }
113 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114 if (document_id == null && href == null)
115 {
116 logger.error("no document specified!");
117 return result;
118 }
119 if (rl != null && rl.equals("0"))
120 {
121 // this is a true external link, we should have been directed to a different page or action
122 logger.error("rl value was 0, shouldn't get here");
123 return result;
124 }
125
126 UserContext userContext = new UserContext(request);
127
128 //append site metadata
129 addSiteMetadata(page_response, userContext);
130 addInterfaceOptions(page_response);
131
132 // get the additional data needed for the page
133 getBackgroundData(page_response, collection, userContext);
134 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136 if (format_elem != null) {
137 // lets look for param defaults set in config file
138 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139 for (int i=0; i<param_defaults.getLength(); i++) {
140 Element p = (Element)param_defaults.item(i);
141 String name = p.getAttribute(GSXML.NAME_ATT);
142 if (params.get(name) ==null) {
143 // wasn't set from interface
144 String value = p.getAttribute(GSXML.VALUE_ATT);
145 params.put(name, value );
146 // also add into request param xml so that xslt knows it too
147 GSXML.addParameterToList(cgi_paramList, name, value);
148 }
149 }
150 }
151
152 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153 if (document_type != null && document_type.equals(""))
154 {
155 //document_type = "hierarchy";
156 document_type = null; // we'll get it later if not already specified
157 }
158 // what if it is null here?? Anu to check...
159
160
161 boolean editing_document = false;
162 String doc_edit = (String) params.get(DOC_EDIT_ARG);
163 if (doc_edit != null && doc_edit.equals("1")) {
164 editing_document = true;
165 }
166
167 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
168 if (editing_document) {
169
170 // call get archive doc
171 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172 String to = "DocXMLGetSection";
173 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174 dx_message.appendChild(dx_request);
175 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178 dx_request.appendChild(dx_section);
179
180 Element dx_response_message = (Element) this.mr.process(dx_message);
181 if (processErrorElements(dx_response_message, page_response))
182 {
183 return result;
184 }
185
186 // get the section out
187 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189 if (section == null) {
190 logger.error("no archive doc returned for "+document_id);
191 return result;
192 }
193 // convert the archive format into the internal format that the page response requires
194
195 // work out doctype
196 // NOTE: this will be coming from collection database in index
197 // the archive file doesn't store this. So we have to assume
198 // that the doc type will not be changing with any
199 // modifications happening to archives.
200
201 // if doc type is null, then we need to work it out.
202 // create a basic doc list containing the current node
203
204 if (document_type == null) {
205 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
206 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
207 basic_doc_list.appendChild(current_doc);
208 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
209 basic_doc_list.appendChild(current_doc);
210 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
211 }
212
213 if (document_type == null) {
214 logger.debug("@@@ doctype is null, setting to simple");
215 document_type = GSXML.DOC_TYPE_SIMPLE;
216 }
217
218 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
219 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
220 page_response.appendChild(doc_elem);
221
222 Element transformed_section = transformArchiveToDocument(section);
223 if (document_type == GSXML.DOC_TYPE_SIMPLE) {
224 // simple doc, only returning a single document node, which is the top level section.
225 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
226 GSXML.mergeElements(doc_elem, transformed_section);
227 return result;
228 }
229
230 // multi sectioned document.
231 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
232 // In docEdit mode, we obtain the text from archives, from doc.xml
233 // Now the transformation has replaced <Section> with <documentNode>
234 // Need to add nodeID, nodeType and docType attributes to each docNode
235 // as doc.xml doesn't store that.
236 insertDocNodeAttributes(transformed_section, document_type, null);
237 doc_elem.appendChild(doc.importNode(transformed_section, true));
238 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
239
240 return result;
241 }
242
243 //whether to retrieve siblings or not
244 boolean get_siblings = false;
245 String sibs = (String) params.get(SIBLING_ARG);
246 if (sibs != null && sibs.equals("1"))
247 {
248 get_siblings = true;
249 }
250
251 String doc_id_modifier = "";
252 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
253 if (sibling_num != null && !sibling_num.equals(""))
254 {
255 // we have to modify the doc name
256 doc_id_modifier = "." + sibling_num + ".ss";
257 }
258
259 boolean expand_document = false;
260 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
261 if (ed_arg != null && ed_arg.equals("1"))
262 {
263 expand_document = true;
264 }
265
266 boolean expand_contents = false;
267 if (expand_document)
268 { // we always expand the contents with the text
269 expand_contents = true;
270 }
271 else
272 {
273 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
274 if (ec_arg != null && ec_arg.equals("1"))
275 {
276 expand_contents = true;
277 }
278 }
279
280 // do we want text content? Not if no_text=1.
281 // expand_document overrides this. - should it??
282 boolean get_text = true;
283 String nt_arg = (String) params.get(NO_TEXT_ARG);
284
285 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
286 logger.debug("SETTING GET TEXT TO FALSE");
287 get_text = false;
288 } else {
289 logger.debug("GET TEXT REMAINS TRUE");
290 }
291
292 // the_document is where all the doc info - structure and metadata etc
293 // is added into, to be returned in the page
294 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
295 page_response.appendChild(the_document);
296
297 // create a basic doc list containing the current node
298 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
299 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
300 basic_doc_list.appendChild(current_doc);
301 if (document_id != null)
302 {
303 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
304 }
305 else
306 {
307 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
308 // do we need this??
309 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
310 }
311
312 if (document_type == null)
313 {
314 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
315 }
316 if (document_type == null)
317 {
318 logger.debug("##### doctype is null, setting to simple");
319 document_type = GSXML.DOC_TYPE_SIMPLE;
320 }
321
322 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
323
324 // Create a parameter list to specify the required structure information
325 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
326
327 if (service_params != null)
328 {
329 GSXML.addParametersToList(ds_param_list, service_params);
330 }
331
332 Element ds_param = null;
333 boolean get_structure = false;
334 boolean get_structure_info = false;
335 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
336 {
337 get_structure_info = true;
338
339 if (expand_contents)
340 {
341 ds_param = doc.createElement(GSXML.PARAM_ELEM);
342 ds_param_list.appendChild(ds_param);
343 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
344 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
345 }
346
347 // get the info needed for paged naviagtion
348 ds_param = doc.createElement(GSXML.PARAM_ELEM);
349 ds_param_list.appendChild(ds_param);
350 ds_param.setAttribute(GSXML.NAME_ATT, "info");
351 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
352 ds_param = doc.createElement(GSXML.PARAM_ELEM);
353 ds_param_list.appendChild(ds_param);
354 ds_param.setAttribute(GSXML.NAME_ATT, "info");
355 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
356 ds_param = doc.createElement(GSXML.PARAM_ELEM);
357 ds_param_list.appendChild(ds_param);
358 ds_param.setAttribute(GSXML.NAME_ATT, "info");
359 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
360
361 if (get_siblings)
362 {
363 ds_param = doc.createElement(GSXML.PARAM_ELEM);
364 ds_param_list.appendChild(ds_param);
365 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
366 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
367 }
368
369 }
370 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
371 {
372 get_structure = true;
373 if (expand_contents)
374 {
375 ds_param = doc.createElement(GSXML.PARAM_ELEM);
376 ds_param_list.appendChild(ds_param);
377 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
378 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
379 }
380 else
381 {
382 // get the info needed for table of contents
383 ds_param = doc.createElement(GSXML.PARAM_ELEM);
384 ds_param_list.appendChild(ds_param);
385 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
386 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
387 ds_param = doc.createElement(GSXML.PARAM_ELEM);
388 ds_param_list.appendChild(ds_param);
389 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
390 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
391 if (get_siblings)
392 {
393 ds_param = doc.createElement(GSXML.PARAM_ELEM);
394 ds_param_list.appendChild(ds_param);
395 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
396 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
397 }
398 }
399 }
400 else
401 {
402 // we dont need any structure
403 }
404
405 boolean has_dummy = false;
406 if (get_structure || get_structure_info)
407 {
408
409 // Build a request to obtain the document structure
410 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
411 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
412 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
413 ds_message.appendChild(ds_request);
414 ds_request.appendChild(ds_param_list);
415
416 // add the node list we created earlier
417 ds_request.appendChild(basic_doc_list);
418
419 // Process the document structure retrieve message
420 Element ds_response_message = (Element) this.mr.process(ds_message);
421 if (processErrorElements(ds_response_message, page_response))
422 {
423 return result;
424 }
425
426 // get the info and print out
427 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
428 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
429 path = GSPath.appendLink(path, "nodeStructureInfo");
430 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
431 // get the doc_node bit
432 if (ds_response_struct_info != null)
433 {
434 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
435 }
436 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
437 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
438 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
439 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
440
441 if (ds_response_structure != null)
442 {
443 // add the contents of the structure bit into the_document
444 NodeList structs = ds_response_structure.getChildNodes();
445 for (int i = 0; i < structs.getLength(); i++)
446 {
447 the_document.appendChild(doc.importNode(structs.item(i), true));
448 }
449 }
450 else
451 {
452 // no structure nodes, so put in a dummy doc node
453 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
454 if (document_id != null)
455 {
456 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
457 }
458 else
459 {
460 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
461
462 }
463 the_document.appendChild(doc_node);
464 has_dummy = true;
465 }
466 }
467 else
468 { // a simple type - we dont have a dummy node for simple
469 // should think about this more
470 // no structure request, so just put in a dummy doc node
471 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
472 if (document_id != null)
473 {
474 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
475 }
476 else
477 {
478 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
479 }
480 the_document.appendChild(doc_node);
481 has_dummy = true;
482 }
483
484 // Build a request to obtain some document metadata
485 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
486 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
487 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
488 dm_message.appendChild(dm_request);
489 // Create a parameter list to specify the required metadata information
490
491 HashSet<String> meta_names = new HashSet<String>();
492 meta_names.add("Title"); // the default
493 if (format_elem != null)
494 {
495 getRequiredMetadataNames(format_elem, meta_names);
496 }
497
498 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
499 if (extraMetaListElem != null)
500 {
501 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
502 for (int i = 0; i < extraMetaList.getLength(); i++)
503 {
504 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
505 }
506 }
507
508 Element dm_param_list = createMetadataParamList(doc,meta_names);
509 if (service_params != null)
510 {
511 GSXML.addParametersToList(dm_param_list, service_params);
512 }
513
514 dm_request.appendChild(dm_param_list);
515
516 // create the doc node list for the metadata request
517 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
518 dm_request.appendChild(dm_doc_list);
519
520 // Add each node from the structure response into the metadata request
521 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
522 for (int i = 0; i < doc_nodes.getLength(); i++)
523 {
524 Element doc_node = (Element) doc_nodes.item(i);
525 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
526
527 // Add the documentNode to the list
528 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
529 dm_doc_list.appendChild(dm_doc_node);
530 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
531 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
532 if (document_id == null){
533 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
534 }
535
536 }
537
538 // we also want a metadata request to the top level document to get
539 // assocfilepath - this could be cached too
540 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
541 dm_message.appendChild(doc_meta_request);
542 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
543 if (service_params != null)
544 {
545 GSXML.addParametersToList(doc_meta_param_list, service_params);
546 }
547
548 doc_meta_request.appendChild(doc_meta_param_list);
549 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
550 doc_meta_param_list.appendChild(doc_param);
551 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
552 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
553
554 // create the doc node list for the metadata request
555 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
556 doc_meta_request.appendChild(doc_list);
557
558 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
559 // the node we want is the root document node
560 if (document_id != null)
561 {
562 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
563 }
564 /*else
565 {
566 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
567 // can we assume that href is always a top level doc??
568 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
569 //doc_node.setAttribute("externalURL", has_rl);
570 }*/
571 doc_list.appendChild(doc_node);
572
573 Element dm_response_message = (Element) this.mr.process(dm_message);
574 if (processErrorElements(dm_response_message, page_response))
575 {
576 return result;
577 }
578
579 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
580 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
581
582 // Merge the metadata with the structure information
583 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
584 for (int i = 0; i < doc_nodes.getLength(); i++)
585 {
586 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
587 }
588 // get the top level doc metadata out
589 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
590 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
591 GSXML.mergeMetadataLists(the_document, top_doc_node);
592
593 // do we want doc text content? If not, we are done.
594 if (!get_text) {
595 // don't get text
596 return result;
597 }
598
599 // Build a request to obtain some document content
600 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
601 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
602 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
603 dc_message.appendChild(dc_request);
604
605 // Create a parameter list to specify the request parameters - empty for now
606 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
607 if (service_params != null)
608 {
609 GSXML.addParametersToList(dc_param_list, service_params);
610 }
611
612 dc_request.appendChild(dc_param_list);
613
614 // get the content
615 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
616 if (expand_document)
617 {
618 dc_request.appendChild(dm_doc_list);
619 }
620 else
621 {
622 dc_request.appendChild(basic_doc_list);
623 }
624 Element dc_response_message = (Element) this.mr.process(dc_message);
625 if (processErrorElements(dc_response_message, page_response))
626 {
627 return result;
628 }
629 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
630
631 if (expand_document)
632 {
633 // Merge the content with the structure information
634 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
635 for (int i = 0; i < doc_nodes.getLength(); i++)
636 {
637 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
638 if (content != null)
639 {
640 if (highlight_query_terms)
641 {
642 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
643 content = highlightQueryTerms(request, node_id, (Element) content);
644 }
645
646 doc_nodes.item(i).appendChild(doc.importNode(content, true));
647 }
648 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
649 }
650 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
651 Element dummy_node = (Element) doc_nodes.item(0);
652 the_document.removeChild(dummy_node);
653 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
654 NodeList dummy_children = dummy_node.getChildNodes();
655 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
656 {
657 // special case as we don't want more than one metadata list
658 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
659 {
660 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
661 }
662 else
663 {
664 the_document.appendChild(dummy_children.item(i));
665 }
666 }
667 }
668 }
669 else
670 {
671 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
672 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
673 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
674 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
675
676 if (dc_response_doc_content == null)
677 {
678 // no content to add
679 if (dc_response_doc.getAttribute("external").equals("true"))
680 {
681
682 //if (dc_response_doc_external != null)
683 //{
684 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
685
686 the_document.setAttribute("selectedNode", href_id);
687 the_document.setAttribute("external", href_id);
688 }
689 return result;
690 }
691 if (highlight_query_terms)
692 {
693 dc_response_doc.removeChild(dc_response_doc_content);
694
695 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
696 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
697 }
698
699 if (provide_annotations)
700 {
701 String service_selected = (String) params.get(ENRICH_DOC_ARG);
702 if (service_selected != null && service_selected.equals("1"))
703 {
704 // now we can modifiy the response doc if needed
705 String enrich_service = (String) params.get(GSParams.SERVICE);
706 // send a message to the service
707 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
708 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
709 enrich_message.appendChild(enrich_request);
710 // check for parameters
711 HashMap e_service_params = (HashMap) params.get("s1");
712 if (e_service_params != null)
713 {
714 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
715 GSXML.addParametersToList(enrich_pl, e_service_params);
716 enrich_request.appendChild(enrich_pl);
717 }
718 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
719 enrich_request.appendChild(e_doc_list);
720 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
721
722 Node enrich_response = this.mr.process(enrich_message);
723
724 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
725 path = GSPath.createPath(links);
726 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
727
728 }
729 } // if provide_annotations
730
731 // use the returned id rather than the sent one cos there may have
732 // been modifiers such as .pr that are removed.
733 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
734 the_document.setAttribute("selectedNode", modified_doc_id);
735 if (has_dummy)
736 {
737 // change the id if necessary and add the content
738 Element dummy_node = (Element) doc_nodes.item(0);
739
740 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
741 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
742 // hack for simple type
743 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
744 {
745 // we dont want the internal docNode, just want the content and metadata in the document
746 // rethink this!!
747 the_document.removeChild(dummy_node);
748
749 NodeList dummy_children = dummy_node.getChildNodes();
750 //for (int i=0; i<dummy_children.getLength(); i++) {
751 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
752 {
753 // special case as we don't want more than one metadata list
754 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
755 {
756 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
757 }
758 else
759 {
760 the_document.appendChild(dummy_children.item(i));
761 }
762 }
763 }
764
765 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
766 }
767 else
768 {
769 // Merge the document content with the metadata and structure information
770 for (int i = 0; i < doc_nodes.getLength(); i++)
771 {
772 Node dn = doc_nodes.item(i);
773 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
774 if (dn_id.equals(modified_doc_id))
775 {
776 dn.appendChild(doc.importNode(dc_response_doc_content, true));
777 break;
778 }
779 }
780 }
781 }
782 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
783 return result;
784 }
785
786 /**
787 * tell the param class what its arguments are if an action has its own
788 * arguments, this should add them to the params object - particularly
789 * important for args that should not be saved
790 */
791 public boolean addActionParameters(GSParams params)
792 {
793 params.addParameter(GOTO_PAGE_ARG, false);
794 params.addParameter(ENRICH_DOC_ARG, false);
795 params.addParameter(EXPAND_DOCUMENT_ARG, false);
796 params.addParameter(EXPAND_CONTENTS_ARG, false);
797 params.addParameter(REALISTIC_BOOK_ARG, false);
798
799 return true;
800 }
801
802 /**
803 * this method gets the collection description, the format info, the list of
804 * enrich services, etc - stuff that is needed for the page, but is the same
805 * whatever the query is - should be cached
806 */
807 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
808 {
809 Document doc = page_response.getOwnerDocument();
810
811 // create a message to process - contains requests for the collection
812 // description, the format element, the enrich services on offer
813 // these could all be cached
814 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
815 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
816 // the format request - ignore for now, where does this request go to??
817 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
818 info_message.appendChild(format_request);
819
820 // the enrich_services request - only do this if provide_annotations is true
821
822 if (provide_annotations)
823 {
824 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
825 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
826 info_message.appendChild(enrich_services_request);
827 }
828
829 Element info_response = (Element) this.mr.process(info_message);
830
831 // the collection is the first response
832 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
833 Element format_resp = (Element) responses.item(0);
834
835 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
836 if (format_elem != null)
837 {
838 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
839 if (global_format_elem != null)
840 {
841 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
842 }
843
844 // set the format type
845 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
846 page_response.appendChild(doc.importNode(format_elem, true));
847 }
848
849 if (provide_annotations)
850 {
851 Element services_resp = (Element) responses.item(1);
852
853 // a new message for the mr
854 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
855 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
856 boolean service_found = false;
857 for (int j = 0; j < e_services.getLength(); j++)
858 {
859 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
860 {
861 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
862 enrich_message.appendChild(s);
863 service_found = true;
864 }
865 }
866 if (service_found)
867 {
868 Element enrich_response = (Element) this.mr.process(enrich_message);
869
870 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
871 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
872 for (int i = 0; i < e_responses.getLength(); i++)
873 {
874 Element e_resp = (Element) e_responses.item(i);
875 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
876 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
877 service_list.appendChild(e_service);
878 }
879 page_response.appendChild(service_list);
880 }
881 } // if provide_annotations
882 return true;
883
884 }
885
886 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
887 {
888 Document doc = basic_doc_list.getOwnerDocument();
889
890 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
891 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
892 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
893 ds_message.appendChild(ds_request);
894
895 // Create a parameter list to specify the required structure information
896 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
897 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
898 ds_param_list.appendChild(ds_param);
899 ds_param.setAttribute(GSXML.NAME_ATT, "info");
900 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
901
902 ds_request.appendChild(ds_param_list);
903
904 // add the node list we created earlier
905 ds_request.appendChild(basic_doc_list);
906
907 // Process the document structure retrieve message
908 Element ds_response_message = (Element) this.mr.process(ds_message);
909 if (processErrorElements(ds_response_message, page_response))
910 {
911 return null;
912 }
913
914 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
915 String path = GSPath.createPath(links);
916 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
917 if (info_elem == null) {
918 return null;
919 }
920 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
921 if (doctype_elem != null)
922 {
923 String doc_type = doctype_elem.getAttribute("value");
924 return doc_type;
925 }
926 return null;
927 }
928
929 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
930 // The docType remains constant as in parameter document_type
931 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
932 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
933 // The nodeType is root if rootNode, internal if there are children and leaf if no children
934 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
935
936 boolean isRoot = false;
937 if(id == null) { // rootNode, get the root nodeID to work with recursively
938 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
939 isRoot = true;
940 } else { // for all but the root node, need to still set the nodeID
941 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
942 }
943
944 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
945
946 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
947 if(docNodes.getLength() > 0) {
948 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
949 for(int i = 0; i < docNodes.getLength(); i++) {
950 Element childDocNode = (Element)docNodes.item(i);
951
952 // work out the child docNode's nodeID based on current id
953 String nodeID = id + "." + (i+1);
954 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
955 }
956 } else {
957 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
958 }
959
960 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
961 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
962
963 }
964
965 /** run the XSLT transform which converts from doc.xml format to our internal document format */
966 protected Element transformArchiveToDocument(Element section) {
967
968 String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
969 if (stylesheet_filename == null) {
970 logger.error("Couldn't find stylesheet archive2document.xsl");
971 return section;
972 }
973
974 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
975 if (stylesheet_doc == null) {
976 logger.error("Couldn't load in stylesheet "+stylesheet_filename);
977 return section;
978 }
979
980 Document section_doc = XMLConverter.newDOM();
981 section_doc.appendChild(section_doc.importNode(section, true));
982 Node result = this.transformer.transform(stylesheet_doc, section_doc);
983 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
984
985 Element new_element;
986 if (result.getNodeType() == Node.DOCUMENT_NODE) {
987 new_element = ((Document) result).getDocumentElement();
988 } else {
989 new_element = (Element) result;
990 }
991
992
993 return new_element;
994
995 }
996
997
998 /**
999 * this involves a bit of a hack to get the equivalent query terms - has to
1000 * requery the query service - uses the last selected service name. (if it
1001 * ends in query). should this action do the query or should it send a
1002 * message to the query action? but that will involve lots of extra stuff.
1003 * also doesn't handle phrases properly - just highlights all the terms
1004 * found in the text.
1005 */
1006 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
1007 {
1008 Document doc = request.getOwnerDocument();
1009
1010 // do the query again to get term info
1011 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1012 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
1013
1014 HashMap previous_params = (HashMap) params.get("p");
1015 if (previous_params == null)
1016 {
1017 return dc_response_doc_content;
1018 }
1019 String service_name = (String) previous_params.get(GSParams.SERVICE);
1020 if (service_name == null || !service_name.endsWith("Query"))
1021 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1022 logger.debug("invalid service, not doing highlighting");
1023 return dc_response_doc_content;
1024 }
1025 String collection = (String) params.get(GSParams.COLLECTION);
1026 UserContext userContext = new UserContext(request);
1027 String to = GSPath.appendLink(collection, service_name);
1028
1029 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1030 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1031 mr_query_message.appendChild(mr_query_request);
1032
1033 // paramList
1034 HashMap service_params = (HashMap) params.get("s1");
1035
1036 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1037 GSXML.addParametersToList(query_param_list, service_params);
1038 if (current_node_id != null) {
1039 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1040 } else {
1041 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1042 }
1043 mr_query_request.appendChild(query_param_list);
1044 // do the query
1045 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1046 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1047 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1048 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1049 if (highlighted_Node != null)
1050 {
1051 // Build a request to process highlighted text
1052
1053 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1054 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1055 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1056 hl_message.appendChild(dc_request);
1057
1058 // Create a parameter list to specify the request parameters - empty for now
1059 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1060 dc_request.appendChild(dc_param_list);
1061
1062 // get the content
1063 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1064 dc_request.appendChild(doc_list);
1065 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1066 doc_list.appendChild(current_doc);
1067 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1068 //Append highlighted content to request for processing
1069 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1070 Element hl_response_message = (Element) this.mr.process(hl_message);
1071
1072 //Get results
1073 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1074 Element content = (Element) contentList.item(0);
1075 return content;
1076 }
1077 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1078 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1079 if (query_term_list_element == null)
1080 {
1081 // no term info
1082 logger.error("No query term information.\n");
1083 return dc_response_doc_content;
1084 }
1085
1086 String content = GSXML.getNodeText(dc_response_doc_content);
1087
1088 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1089 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1090
1091 HashSet<String> query_term_variants = new HashSet<String>();
1092 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1093 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1094 {
1095 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1096 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1097 {
1098 for (int i = 0; i < terms_nodelist.getLength(); i++)
1099 {
1100 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1101 String termValueU = null;
1102 String termValueL = null;
1103
1104 if (termValue.length() > 1)
1105 {
1106 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1107 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1108 }
1109 else
1110 {
1111 termValueU = termValue.substring(0, 1).toUpperCase();
1112 termValueL = termValue.substring(0, 1).toLowerCase();
1113 }
1114
1115 query_term_variants.add(termValueU);
1116 query_term_variants.add(termValueL);
1117 }
1118 }
1119 }
1120 else
1121 {
1122 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1123 {
1124 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1125 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1126 for (int j = 0; j < equivalent_terms.length; j++)
1127 {
1128 query_term_variants.add(equivalent_terms[j]);
1129 }
1130 }
1131 }
1132
1133 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1134
1135 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1136 String performed_query = GSXML.getNodeText(query_element) + " ";
1137
1138 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1139 int term_start = 0;
1140 boolean in_term = false;
1141 boolean in_phrase = false;
1142 for (int i = 0; i < performed_query.length(); i++)
1143 {
1144 char character = performed_query.charAt(i);
1145 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1146
1147 // Has a query term just started?
1148 if (in_term == false && is_character_letter_or_digit == true)
1149 {
1150 in_term = true;
1151 term_start = i;
1152 }
1153
1154 // Or has a term just finished?
1155 else if (in_term == true && is_character_letter_or_digit == false)
1156 {
1157 in_term = false;
1158 String term = performed_query.substring(term_start, i);
1159
1160 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1161 if (term_element != null)
1162 {
1163
1164 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1165
1166 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1167 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1168 {
1169 String termValueU = null;
1170 String termValueL = null;
1171
1172 if (term.length() > 1)
1173 {
1174 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1175 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1176 }
1177 else
1178 {
1179 termValueU = term.substring(0, 1).toUpperCase();
1180 termValueL = term.substring(0, 1).toLowerCase();
1181 }
1182
1183 phrase_query_p_term_x_variants.add(termValueU);
1184 phrase_query_p_term_x_variants.add(termValueL);
1185 }
1186 else
1187 {
1188 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1189 {
1190 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1191 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1192 for (int k = 0; k < term_equivalent_terms.length; k++)
1193 {
1194 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1195 }
1196 }
1197 }
1198 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1199
1200 if (in_phrase == false)
1201 {
1202 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1203 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1204 }
1205 }
1206 }
1207 // Watch for phrases (surrounded by quotes)
1208 if (character == '\"')
1209 {
1210 // Has a phrase just started?
1211 if (in_phrase == false)
1212 {
1213 in_phrase = true;
1214 }
1215 // Or has a phrase just finished?
1216 else if (in_phrase == true)
1217 {
1218 in_phrase = false;
1219 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1220 }
1221
1222 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1223 }
1224 }
1225
1226 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1227 }
1228
1229 /**
1230 * Highlights query terms in a piece of text.
1231 */
1232 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1233 {
1234 // Convert the content string to an array of characters for speed
1235 char[] content_characters = new char[content.length()];
1236 content.getChars(0, content.length(), content_characters, 0);
1237
1238 // Now skim through the content, identifying word matches
1239 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1240 int word_start = 0;
1241 boolean in_word = false;
1242 boolean preceding_word_matched = false;
1243 boolean inTag = false;
1244 for (int i = 0; i < content_characters.length; i++)
1245 {
1246 //We don't want to find words inside HTML tags
1247 if (content_characters[i] == '<')
1248 {
1249 inTag = true;
1250 continue;
1251 }
1252 else if (inTag && content_characters[i] == '>')
1253 {
1254 inTag = false;
1255 }
1256 else if (inTag)
1257 {
1258 continue;
1259 }
1260
1261 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1262
1263 // Has a word just started?
1264 if (in_word == false && is_character_letter_or_digit == true)
1265 {
1266 in_word = true;
1267 word_start = i;
1268 }
1269
1270 // Or has a word just finished?
1271 else if (in_word == true && is_character_letter_or_digit == false)
1272 {
1273 in_word = false;
1274
1275 // Check if the word matches any of the query term equivalents
1276 String word = new String(content_characters, word_start, (i - word_start));
1277 if (query_term_variants.contains(word))
1278 {
1279 // We have found a matching word, so remember its location
1280 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1281 preceding_word_matched = true;
1282 }
1283 else
1284 {
1285 preceding_word_matched = false;
1286 }
1287 }
1288 }
1289
1290 // Don't forget the last word...
1291 if (in_word == true)
1292 {
1293 // Check if the word matches any of the query term equivalents
1294 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1295 if (query_term_variants.contains(word))
1296 {
1297 // We have found a matching word, so remember its location
1298 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1299 }
1300 }
1301
1302 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1303 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1304
1305 // Deal with phrases now
1306 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1307 for (int i = 0; i < word_matches.size(); i++)
1308 {
1309 WordMatch word_match = word_matches.get(i);
1310
1311 // See if any partial phrase matches are extended by this word
1312 if (word_match.preceding_word_matched)
1313 {
1314 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1315 {
1316 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1317 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1318 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1319 if (phrase_query_p_term_x_variants.contains(word_match.word))
1320 {
1321 partial_phrase_match.num_words_matched++;
1322
1323 // Has a complete phrase match occurred?
1324 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1325 {
1326 // Check for overlaps by looking at the previous highlight range
1327 if (!highlight_end_positions.isEmpty())
1328 {
1329 int last_highlight_index = highlight_end_positions.size() - 1;
1330 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1331 if (last_highlight_end > partial_phrase_match.start_position)
1332 {
1333 // There is an overlap, so remove the previous phrase match
1334 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1335 highlight_end_positions.remove(last_highlight_index);
1336 partial_phrase_match.start_position = last_highlight_start;
1337 }
1338 }
1339
1340 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1341 highlight_end_positions.add(new Integer(word_match.end_position));
1342 }
1343 // No, but add the partial match back into the list for next time
1344 else
1345 {
1346 partial_phrase_matches.add(partial_phrase_match);
1347 }
1348 }
1349 }
1350 }
1351 else
1352 {
1353 partial_phrase_matches.clear();
1354 }
1355
1356 // See if this word is at the start of any of the phrases
1357 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1358 {
1359 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1360 if (phrase_query_p_term_variants_list.size()>0) {
1361 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1362 if (phrase_query_p_term_1_variants.contains(word_match.word))
1363 {
1364 // If this phrase is just one word long, we have a complete match
1365 if (phrase_query_p_term_variants_list.size() == 1)
1366 {
1367 highlight_start_positions.add(new Integer(word_match.start_position));
1368 highlight_end_positions.add(new Integer(word_match.end_position));
1369 }
1370 // Otherwise we have the start of a potential phrase match
1371 else
1372 {
1373 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1374 }
1375 }
1376 }
1377 }
1378 }
1379
1380 // Now add the annotation tags into the document at the correct points
1381 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1382
1383 int last_wrote = 0;
1384 for (int i = 0; i < highlight_start_positions.size(); i++)
1385 {
1386 int highlight_start = highlight_start_positions.get(i).intValue();
1387 int highlight_end = highlight_end_positions.get(i).intValue();
1388
1389 // Print anything before the highlight range
1390 if (last_wrote < highlight_start)
1391 {
1392 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1393 content_element.appendChild(doc.createTextNode(preceding_text));
1394 }
1395
1396 // Print the highlight text, annotated
1397 if (highlight_end > last_wrote)
1398 {
1399 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1400 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1401 annotation_element.setAttribute("type", "query_term");
1402 content_element.appendChild(annotation_element);
1403 last_wrote = highlight_end;
1404 }
1405 }
1406
1407 // Finish off any unwritten text
1408 if (last_wrote < content_characters.length)
1409 {
1410 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1411 content_element.appendChild(doc.createTextNode(remaining_text));
1412 }
1413 return content_element;
1414 }
1415
1416 static private class WordMatch
1417 {
1418 public String word;
1419 public int start_position;
1420 public int end_position;
1421 public boolean preceding_word_matched;
1422
1423 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1424 {
1425 this.word = word;
1426 this.start_position = start_position;
1427 this.end_position = end_position;
1428 this.preceding_word_matched = preceding_word_matched;
1429 }
1430 }
1431
1432 static private class PartialPhraseMatch
1433 {
1434 public int start_position;
1435 public int query_phrase_number;
1436 public int num_words_matched;
1437
1438 public PartialPhraseMatch(int start_position, int query_phrase_number)
1439 {
1440 this.start_position = start_position;
1441 this.query_phrase_number = query_phrase_number;
1442 this.num_words_matched = 1;
1443 }
1444 }
1445}
Note: See TracBrowser for help on using the repository browser.