source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 32071

Last change on this file since 32071 was 32071, checked in by ak19, 6 years ago

Changes to set the docType of the <document> element and to set the docType, nodeType and nodeID on each <documentNode>.

  • Property svn:keywords set to Author Date Id Revision
File size: 52.4 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24import org.greenstone.util.GlobalProperties;
25
26// XML classes
27import org.w3c.dom.Document;
28import org.w3c.dom.Element;
29import org.w3c.dom.Node;
30import org.w3c.dom.Text;
31import org.w3c.dom.NodeList;
32
33// General Java classes
34import java.util.ArrayList;
35import java.util.HashMap;
36import java.util.HashSet;
37import java.io.File;
38import java.io.Serializable;
39
40import org.apache.log4j.*;
41
42/** Action class for retrieving Documents via the message router */
43public class DocumentAction extends Action
44{
45
46 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName()); // log4j logger named after this class
47
48 // this is used to specify that the sibling nodes of a selected one should be obtained
49 public static final String SIBLING_ARG = "sib"; // process() requests the "siblings" structure when this argument is "1"
50 public static final String GOTO_PAGE_ARG = "gp"; // sibling number; process() appends "." + value + ".ss" to the document id
51 public static final String ENRICH_DOC_ARG = "end"; // when "1" and annotations are enabled, process() passes the content to the selected enrich service
52 public static final String EXPAND_DOCUMENT_ARG = "ed"; // when "1", content is fetched for every node of the document; also forces expanded contents
53 public static final String EXPAND_CONTENTS_ARG = "ec"; // when "1", the "entire" document structure is requested
54 public static final String REALISTIC_BOOK_ARG = "book"; // registered in addActionParameters(); not read by process() - NOTE(review): presumably consumed by the XSLT layer, confirm
55 public static final String NO_TEXT_ARG = "noText"; // when "1" (and ed is not "1"), process() returns without fetching the text content
56 public static final String DOC_EDIT_ARG = "docEdit"; // when "1", process() returns the archive (doc.xml) section in the internal document format
57
58 /**
59 * if this is set to true, when a document is displayed, any annotation type
60 * services (enrich) will be offered to the user as well. Switched on by
60 * configure() when the "displayAnnotationService" config param is "true".
61 */
62 protected boolean provide_annotations = false;
63
64 protected boolean highlight_query_terms = false; // switched on by configure() when the "highlightQueryTerms" config param is "true"; content is then run through highlightQueryTerms()
65
66 public boolean configure()
67 {
68 super.configure();
69 String highlight = (String) config_params.get("highlightQueryTerms");
70 if (highlight != null && highlight.equals("true"))
71 {
72 highlight_query_terms = true;
73 }
74 String annotate = (String) config_params.get("displayAnnotationService");
75 if (annotate != null && annotate.equals("true"))
76 {
77 provide_annotations = true;
78 }
79 return true;
80 }
81
82 public Node process(Node message_node)
83 {
84 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
85
86 Element message = GSXML.nodeToElement(message_node);
87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
88
89 // the response
90 Element result = doc.createElement(GSXML.MESSAGE_ELEM);
91 Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
92 result.appendChild(page_response);
93
94 // get the request - assume only one
95 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
96 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
97 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
98
99 // just in case there are some that need to get passed to the services
100 HashMap service_params = (HashMap) params.get("s0");
101
102 String collection = (String) params.get(GSParams.COLLECTION);
103 String document_id = (String) params.get(GSParams.DOCUMENT);
104 if (document_id != null && document_id.equals(""))
105 {
106 document_id = null;
107 }
108 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
109 if (href != null && href.equals(""))
110 {
111 href = null;
112 }
113 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
114 if (document_id == null && href == null)
115 {
116 logger.error("no document specified!");
117 return result;
118 }
119 if (rl != null && rl.equals("0"))
120 {
121 // this is a true external link, we should have been directed to a different page or action
122 logger.error("rl value was 0, shouldn't get here");
123 return result;
124 }
125
126 UserContext userContext = new UserContext(request);
127
128 //append site metadata
129 addSiteMetadata(page_response, userContext);
130 addInterfaceOptions(page_response);
131
132 // get the additional data needed for the page
133 getBackgroundData(page_response, collection, userContext);
134 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
135
136 if (format_elem != null) {
137 // lets look for param defaults set in config file
138 NodeList param_defaults = format_elem.getElementsByTagName("paramDefault");
139 for (int i=0; i<param_defaults.getLength(); i++) {
140 Element p = (Element)param_defaults.item(i);
141 String name = p.getAttribute(GSXML.NAME_ATT);
142 if (params.get(name) ==null) {
143 // wasn't set from interface
144 String value = p.getAttribute(GSXML.VALUE_ATT);
145 params.put(name, value );
146 // also add into request param xml so that xslt knows it too
147 GSXML.addParameterToList(cgi_paramList, name, value);
148 }
149 }
150 }
151
152 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
153 if (document_type != null && document_type.equals(""))
154 {
155 //document_type = "hierarchy";
156 document_type = null; // we'll get it later if not already specified
157 }
158 // what if it is null here?? Anu to check...
159
160
161 boolean editing_document = false;
162 String doc_edit = (String) params.get(DOC_EDIT_ARG);
163 if (doc_edit != null && doc_edit.equals("1")) {
164 editing_document = true;
165 }
166
167 // are we editing mode? just get the archive document, convert to our internal doc format, and return it
168 if (editing_document) {
169
170 // call get archive doc
171 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
172 String to = "DocXMLGetSection";
173 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
174 dx_message.appendChild(dx_request);
175 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
176 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
177 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
178 dx_request.appendChild(dx_section);
179
180 Element dx_response_message = (Element) this.mr.process(dx_message);
181 if (processErrorElements(dx_response_message, page_response))
182 {
183 return result;
184 }
185
186 // get the section out
187 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
188 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
189 if (section == null) {
190 logger.error("no archive doc returned for "+document_id);
191 return result;
192 }
193 // convert the archive format into the internal format that the page response requires
194
195 // work out doctype
196 // create a basic doc list containing the current node
197 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
198 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
199 basic_doc_list.appendChild(current_doc);
200 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
201 basic_doc_list.appendChild(current_doc);
202 if (document_type == null) {
203 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
204 }
205 if (document_type == null) {
206 logger.debug("@@@ doctype is null, setting to simple");
207 document_type = GSXML.DOC_TYPE_SIMPLE;
208 }
209
210 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
211 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
212 page_response.appendChild(doc_elem);
213 section.setAttribute(GSXML.NODE_ID_ATT, document_id);
214
215
216 Element transformed_section = transformArchiveToDocument(section);
217 // In docEdit mode, we obtain the text from archives, from doc.xml
218 // Now the transformation has replaced <Section> with <documentNode>
219 // Need to add nodeID, nodeType and docType attributes to each docNode
220 // as doc.xml doesn't store that.
221 insertDocNodeAttributes(transformed_section, document_type, null);
222 doc_elem.appendChild(doc.importNode(transformed_section, true));
223 logger.debug("dx result = "+XMLConverter.getPrettyString(result));
224
225 return result;
226 }
227
228 //whether to retrieve siblings or not
229 boolean get_siblings = false;
230 String sibs = (String) params.get(SIBLING_ARG);
231 if (sibs != null && sibs.equals("1"))
232 {
233 get_siblings = true;
234 }
235
236 String doc_id_modifier = "";
237 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
238 if (sibling_num != null && !sibling_num.equals(""))
239 {
240 // we have to modify the doc name
241 doc_id_modifier = "." + sibling_num + ".ss";
242 }
243
244 boolean expand_document = false;
245 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
246 if (ed_arg != null && ed_arg.equals("1"))
247 {
248 expand_document = true;
249 }
250
251 boolean expand_contents = false;
252 if (expand_document)
253 { // we always expand the contents with the text
254 expand_contents = true;
255 }
256 else
257 {
258 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
259 if (ec_arg != null && ec_arg.equals("1"))
260 {
261 expand_contents = true;
262 }
263 }
264
265 // do we want text content? Not if no_text=1.
266 // expand_document overrides this. - should it??
267 boolean get_text = true;
268 String nt_arg = (String) params.get(NO_TEXT_ARG);
269
270 if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
271 logger.debug("SETTING GET TEXT TO FALSE");
272 get_text = false;
273 } else {
274 logger.debug("GET TEXT REMAINS TRUE");
275 }
276
277 // the_document is where all the doc info - structure and metadata etc
278 // is added into, to be returned in the page
279 Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
280 page_response.appendChild(the_document);
281
282 // create a basic doc list containing the current node
283 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
284 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
285 basic_doc_list.appendChild(current_doc);
286 if (document_id != null)
287 {
288 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
289 }
290 else
291 {
292 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
293 // do we need this??
294 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
295 }
296
297 if (document_type == null)
298 {
299 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
300 }
301 if (document_type == null)
302 {
303 logger.debug("##### doctype is null, setting to simple");
304 document_type = GSXML.DOC_TYPE_SIMPLE;
305 }
306
307 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
308
309 // Create a parameter list to specify the required structure information
310 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
311
312 if (service_params != null)
313 {
314 GSXML.addParametersToList(ds_param_list, service_params);
315 }
316
317 Element ds_param = null;
318 boolean get_structure = false;
319 boolean get_structure_info = false;
320 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
321 {
322 get_structure_info = true;
323
324 if (expand_contents)
325 {
326 ds_param = doc.createElement(GSXML.PARAM_ELEM);
327 ds_param_list.appendChild(ds_param);
328 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
329 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
330 }
331
332 // get the info needed for paged naviagtion
333 ds_param = doc.createElement(GSXML.PARAM_ELEM);
334 ds_param_list.appendChild(ds_param);
335 ds_param.setAttribute(GSXML.NAME_ATT, "info");
336 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
337 ds_param = doc.createElement(GSXML.PARAM_ELEM);
338 ds_param_list.appendChild(ds_param);
339 ds_param.setAttribute(GSXML.NAME_ATT, "info");
340 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
341 ds_param = doc.createElement(GSXML.PARAM_ELEM);
342 ds_param_list.appendChild(ds_param);
343 ds_param.setAttribute(GSXML.NAME_ATT, "info");
344 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
345
346 if (get_siblings)
347 {
348 ds_param = doc.createElement(GSXML.PARAM_ELEM);
349 ds_param_list.appendChild(ds_param);
350 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
351 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
352 }
353
354 }
355 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
356 {
357 get_structure = true;
358 if (expand_contents)
359 {
360 ds_param = doc.createElement(GSXML.PARAM_ELEM);
361 ds_param_list.appendChild(ds_param);
362 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
363 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
364 }
365 else
366 {
367 // get the info needed for table of contents
368 ds_param = doc.createElement(GSXML.PARAM_ELEM);
369 ds_param_list.appendChild(ds_param);
370 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
371 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
372 ds_param = doc.createElement(GSXML.PARAM_ELEM);
373 ds_param_list.appendChild(ds_param);
374 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
375 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
376 if (get_siblings)
377 {
378 ds_param = doc.createElement(GSXML.PARAM_ELEM);
379 ds_param_list.appendChild(ds_param);
380 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
381 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
382 }
383 }
384 }
385 else
386 {
387 // we dont need any structure
388 }
389
390 boolean has_dummy = false;
391 if (get_structure || get_structure_info)
392 {
393
394 // Build a request to obtain the document structure
395 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
396 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
397 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
398 ds_message.appendChild(ds_request);
399 ds_request.appendChild(ds_param_list);
400
401 // add the node list we created earlier
402 ds_request.appendChild(basic_doc_list);
403
404 // Process the document structure retrieve message
405 Element ds_response_message = (Element) this.mr.process(ds_message);
406 if (processErrorElements(ds_response_message, page_response))
407 {
408 return result;
409 }
410
411 // get the info and print out
412 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
413 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
414 path = GSPath.appendLink(path, "nodeStructureInfo");
415 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
416 // get the doc_node bit
417 if (ds_response_struct_info != null)
418 {
419 the_document.appendChild(doc.importNode(ds_response_struct_info, true));
420 }
421 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
422 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
423 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
424 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
425
426 if (ds_response_structure != null)
427 {
428 // add the contents of the structure bit into the_document
429 NodeList structs = ds_response_structure.getChildNodes();
430 for (int i = 0; i < structs.getLength(); i++)
431 {
432 the_document.appendChild(doc.importNode(structs.item(i), true));
433 }
434 }
435 else
436 {
437 // no structure nodes, so put in a dummy doc node
438 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
439 if (document_id != null)
440 {
441 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
442 }
443 else
444 {
445 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
446
447 }
448 the_document.appendChild(doc_node);
449 has_dummy = true;
450 }
451 }
452 else
453 { // a simple type - we dont have a dummy node for simple
454 // should think about this more
455 // no structure request, so just put in a dummy doc node
456 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
457 if (document_id != null)
458 {
459 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
460 }
461 else
462 {
463 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
464 }
465 the_document.appendChild(doc_node);
466 has_dummy = true;
467 }
468
469 // Build a request to obtain some document metadata
470 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
471 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
472 Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
473 dm_message.appendChild(dm_request);
474 // Create a parameter list to specify the required metadata information
475
476 HashSet<String> meta_names = new HashSet<String>();
477 meta_names.add("Title"); // the default
478 if (format_elem != null)
479 {
480 getRequiredMetadataNames(format_elem, meta_names);
481 }
482
483 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
484 if (extraMetaListElem != null)
485 {
486 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
487 for (int i = 0; i < extraMetaList.getLength(); i++)
488 {
489 meta_names.add(((Element) extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
490 }
491 }
492
493 Element dm_param_list = createMetadataParamList(doc,meta_names);
494 if (service_params != null)
495 {
496 GSXML.addParametersToList(dm_param_list, service_params);
497 }
498
499 dm_request.appendChild(dm_param_list);
500
501 // create the doc node list for the metadata request
502 Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
503 dm_request.appendChild(dm_doc_list);
504
505 // Add each node from the structure response into the metadata request
506 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
507 for (int i = 0; i < doc_nodes.getLength(); i++)
508 {
509 Element doc_node = (Element) doc_nodes.item(i);
510 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
511
512 // Add the documentNode to the list
513 Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
514 dm_doc_list.appendChild(dm_doc_node);
515 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
516 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
517 if (document_id == null){
518 dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
519 }
520
521 }
522
523 // we also want a metadata request to the top level document to get
524 // assocfilepath - this could be cached too
525 Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
526 dm_message.appendChild(doc_meta_request);
527 Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
528 if (service_params != null)
529 {
530 GSXML.addParametersToList(doc_meta_param_list, service_params);
531 }
532
533 doc_meta_request.appendChild(doc_meta_param_list);
534 Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
535 doc_meta_param_list.appendChild(doc_param);
536 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
537 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
538
539 // create the doc node list for the metadata request
540 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
541 doc_meta_request.appendChild(doc_list);
542
543 Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
544 // the node we want is the root document node
545 if (document_id != null)
546 {
547 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
548 }
549 /*else
550 {
551 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
552 // can we assume that href is always a top level doc??
553 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
554 //doc_node.setAttribute("externalURL", has_rl);
555 }*/
556 doc_list.appendChild(doc_node);
557
558 Element dm_response_message = (Element) this.mr.process(dm_message);
559 if (processErrorElements(dm_response_message, page_response))
560 {
561 return result;
562 }
563
564 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
565 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
566
567 // Merge the metadata with the structure information
568 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
569 for (int i = 0; i < doc_nodes.getLength(); i++)
570 {
571 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
572 }
573 // get the top level doc metadata out
574 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
575 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
576 GSXML.mergeMetadataLists(the_document, top_doc_node);
577
578 // do we want doc text content? If not, we are done.
579 if (!get_text) {
580 // don't get text
581 return result;
582 }
583
584 // Build a request to obtain some document content
585 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
586 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
587 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
588 dc_message.appendChild(dc_request);
589
590 // Create a parameter list to specify the request parameters - empty for now
591 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
592 if (service_params != null)
593 {
594 GSXML.addParametersToList(dc_param_list, service_params);
595 }
596
597 dc_request.appendChild(dc_param_list);
598
599 // get the content
600 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
601 if (expand_document)
602 {
603 dc_request.appendChild(dm_doc_list);
604 }
605 else
606 {
607 dc_request.appendChild(basic_doc_list);
608 }
609 Element dc_response_message = (Element) this.mr.process(dc_message);
610 if (processErrorElements(dc_response_message, page_response))
611 {
612 return result;
613 }
614 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
615
616 if (expand_document)
617 {
618 // Merge the content with the structure information
619 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
620 for (int i = 0; i < doc_nodes.getLength(); i++)
621 {
622 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
623 if (content != null)
624 {
625 if (highlight_query_terms)
626 {
627 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
628 content = highlightQueryTerms(request, node_id, (Element) content);
629 }
630
631 doc_nodes.item(i).appendChild(doc.importNode(content, true));
632 }
633 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
634 }
635 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
636 Element dummy_node = (Element) doc_nodes.item(0);
637 the_document.removeChild(dummy_node);
638 the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
639 NodeList dummy_children = dummy_node.getChildNodes();
640 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
641 {
642 // special case as we don't want more than one metadata list
643 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
644 {
645 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
646 }
647 else
648 {
649 the_document.appendChild(dummy_children.item(i));
650 }
651 }
652 }
653 }
654 else
655 {
656 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
657 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
658 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
659 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
660
661 if (dc_response_doc_content == null)
662 {
663 // no content to add
664 if (dc_response_doc.getAttribute("external").equals("true"))
665 {
666
667 //if (dc_response_doc_external != null)
668 //{
669 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
670
671 the_document.setAttribute("selectedNode", href_id);
672 the_document.setAttribute("external", href_id);
673 }
674 return result;
675 }
676 if (highlight_query_terms)
677 {
678 dc_response_doc.removeChild(dc_response_doc_content);
679
680 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
681 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
682 }
683
684 if (provide_annotations)
685 {
686 String service_selected = (String) params.get(ENRICH_DOC_ARG);
687 if (service_selected != null && service_selected.equals("1"))
688 {
689 // now we can modifiy the response doc if needed
690 String enrich_service = (String) params.get(GSParams.SERVICE);
691 // send a message to the service
692 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
693 Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
694 enrich_message.appendChild(enrich_request);
695 // check for parameters
696 HashMap e_service_params = (HashMap) params.get("s1");
697 if (e_service_params != null)
698 {
699 Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
700 GSXML.addParametersToList(enrich_pl, e_service_params);
701 enrich_request.appendChild(enrich_pl);
702 }
703 Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
704 enrich_request.appendChild(e_doc_list);
705 e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
706
707 Node enrich_response = this.mr.process(enrich_message);
708
709 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
710 path = GSPath.createPath(links);
711 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
712
713 }
714 } // if provide_annotations
715
716 // use the returned id rather than the sent one cos there may have
717 // been modifiers such as .pr that are removed.
718 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
719 the_document.setAttribute("selectedNode", modified_doc_id);
720 if (has_dummy)
721 {
722 // change the id if necessary and add the content
723 Element dummy_node = (Element) doc_nodes.item(0);
724
725 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
726 dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
727 // hack for simple type
728 if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
729 {
730 // we dont want the internal docNode, just want the content and metadata in the document
731 // rethink this!!
732 the_document.removeChild(dummy_node);
733
734 NodeList dummy_children = dummy_node.getChildNodes();
735 //for (int i=0; i<dummy_children.getLength(); i++) {
736 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
737 {
738 // special case as we don't want more than one metadata list
739 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
740 {
741 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
742 }
743 else
744 {
745 the_document.appendChild(dummy_children.item(i));
746 }
747 }
748 }
749
750 the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
751 }
752 else
753 {
754 // Merge the document content with the metadata and structure information
755 for (int i = 0; i < doc_nodes.getLength(); i++)
756 {
757 Node dn = doc_nodes.item(i);
758 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
759 if (dn_id.equals(modified_doc_id))
760 {
761 dn.appendChild(doc.importNode(dc_response_doc_content, true));
762 break;
763 }
764 }
765 }
766 }
767 //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
768 return result;
769 }
770
771 /**
772 * tell the param class what its arguments are if an action has its own
773 * arguments, this should add them to the params object - particularly
774 * important for args that should not be saved
775 */
776 public boolean addActionParameters(GSParams params)
777 {
778 params.addParameter(GOTO_PAGE_ARG, false);
779 params.addParameter(ENRICH_DOC_ARG, false);
780 params.addParameter(EXPAND_DOCUMENT_ARG, false);
781 params.addParameter(EXPAND_CONTENTS_ARG, false);
782 params.addParameter(REALISTIC_BOOK_ARG, false);
783
784 return true;
785 }
786
787 /**
788 * this method gets the collection description, the format info, the list of
789 * enrich services, etc - stuff that is needed for the page, but is the same
790 * whatever the query is - should be cached
791 */
792 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
793 {
794 Document doc = page_response.getOwnerDocument();
795
796 // create a message to process - contains requests for the collection
797 // description, the format element, the enrich services on offer
798 // these could all be cached
799 Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
800 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
801 // the format request - ignore for now, where does this request go to??
802 Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
803 info_message.appendChild(format_request);
804
805 // the enrich_services request - only do this if provide_annotations is true
806
807 if (provide_annotations)
808 {
809 Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
810 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
811 info_message.appendChild(enrich_services_request);
812 }
813
814 Element info_response = (Element) this.mr.process(info_message);
815
816 // the collection is the first response
817 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
818 Element format_resp = (Element) responses.item(0);
819
820 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
821 if (format_elem != null)
822 {
823 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
824 if (global_format_elem != null)
825 {
826 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
827 }
828
829 // set the format type
830 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
831 page_response.appendChild(doc.importNode(format_elem, true));
832 }
833
834 if (provide_annotations)
835 {
836 Element services_resp = (Element) responses.item(1);
837
838 // a new message for the mr
839 Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
840 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
841 boolean service_found = false;
842 for (int j = 0; j < e_services.getLength(); j++)
843 {
844 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
845 {
846 Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
847 enrich_message.appendChild(s);
848 service_found = true;
849 }
850 }
851 if (service_found)
852 {
853 Element enrich_response = (Element) this.mr.process(enrich_message);
854
855 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
856 Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
857 for (int i = 0; i < e_responses.getLength(); i++)
858 {
859 Element e_resp = (Element) e_responses.item(i);
860 Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
861 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
862 service_list.appendChild(e_service);
863 }
864 page_response.appendChild(service_list);
865 }
866 } // if provide_annotations
867 return true;
868
869 }
870
871 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
872 {
873 Document doc = basic_doc_list.getOwnerDocument();
874
875 Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
876 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
877 Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
878 ds_message.appendChild(ds_request);
879
880 // Create a parameter list to specify the required structure information
881 Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
882 Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
883 ds_param_list.appendChild(ds_param);
884 ds_param.setAttribute(GSXML.NAME_ATT, "info");
885 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
886
887 ds_request.appendChild(ds_param_list);
888
889 // add the node list we created earlier
890 ds_request.appendChild(basic_doc_list);
891
892 // Process the document structure retrieve message
893 Element ds_response_message = (Element) this.mr.process(ds_message);
894 if (processErrorElements(ds_response_message, page_response))
895 {
896 return null;
897 }
898
899 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
900 String path = GSPath.createPath(links);
901 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
902 if (info_elem == null) {
903 return null;
904 }
905 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
906 if (doctype_elem != null)
907 {
908 String doc_type = doctype_elem.getAttribute("value");
909 return doc_type;
910 }
911 return null;
912 }
913
914 // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
915 // The docType remains constant as in parameter document_type
916 // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
917 // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
918 // The nodeType is root if rootNode, internal if there are children and leaf if no children
919 protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
920
921 boolean isRoot = false;
922 if(id == null) { // rootNode, get the root nodeID to work with recursively
923 id = docNode.getAttribute(GSXML.NODE_ID_ATT);
924 isRoot = true;
925 } else { // for all but the root node, need to still set the nodeID
926 docNode.setAttribute(GSXML.NODE_ID_ATT, id);
927 }
928
929 docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
930
931 NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
932 if(docNodes.getLength() > 0) {
933 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
934 for(int i = 0; i < docNodes.getLength(); i++) {
935 Element childDocNode = (Element)docNodes.item(i);
936
937 // work out the child docNode's nodeID based on current id
938 String nodeID = id + "." + (i+1);
939 insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
940 }
941 } else {
942 docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
943 }
944
945 // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
946 if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
947
948 }
949
950 /** run the XSLT transform which converts from doc.xml format to our internal document format */
951 protected Element transformArchiveToDocument(Element section) {
952
953 String stylesheet_file = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), null, "archive2document.xsl");
954 Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_file));
955 if (stylesheet_doc == null) {
956 logger.error("Couldn't load in stylesheet "+stylesheet_file);
957 return section;
958 }
959
960 Document section_doc = XMLConverter.newDOM();
961 section_doc.appendChild(section_doc.importNode(section, true));
962 Node result = this.transformer.transform(stylesheet_doc, section_doc);
963 logger.debug("transform result = "+XMLConverter.getPrettyString(result));
964
965 Element new_element;
966 if (result.getNodeType() == Node.DOCUMENT_NODE) {
967 new_element = ((Document) result).getDocumentElement();
968 } else {
969 new_element = (Element) result;
970 }
971
972
973 return new_element;
974
975 }
976
977
978 /**
979 * this involves a bit of a hack to get the equivalent query terms - has to
980 * requery the query service - uses the last selected service name. (if it
981 * ends in query). should this action do the query or should it send a
982 * message to the query action? but that will involve lots of extra stuff.
983 * also doesn't handle phrases properly - just highlights all the terms
984 * found in the text.
985 */
986 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
987 {
988 Document doc = request.getOwnerDocument();
989
990 // do the query again to get term info
991 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
992 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
993
994 HashMap previous_params = (HashMap) params.get("p");
995 if (previous_params == null)
996 {
997 return dc_response_doc_content;
998 }
999 String service_name = (String) previous_params.get(GSParams.SERVICE);
1000 if (service_name == null || !service_name.endsWith("Query"))
1001 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
1002 logger.debug("invalid service, not doing highlighting");
1003 return dc_response_doc_content;
1004 }
1005 String collection = (String) params.get(GSParams.COLLECTION);
1006 UserContext userContext = new UserContext(request);
1007 String to = GSPath.appendLink(collection, service_name);
1008
1009 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
1010 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1011 mr_query_message.appendChild(mr_query_request);
1012
1013 // paramList
1014 HashMap service_params = (HashMap) params.get("s1");
1015
1016 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1017 GSXML.addParametersToList(query_param_list, service_params);
1018 if (current_node_id != null) {
1019 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
1020 } else {
1021 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
1022 }
1023 mr_query_request.appendChild(query_param_list);
1024 // do the query
1025 Element mr_query_response = (Element) this.mr.process(mr_query_message);
1026 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
1027 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
1028 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
1029 if (highlighted_Node != null)
1030 {
1031 // Build a request to process highlighted text
1032
1033 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
1034 to = GSPath.appendLink(collection, "DocumentContentRetrieve");
1035 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
1036 hl_message.appendChild(dc_request);
1037
1038 // Create a parameter list to specify the request parameters - empty for now
1039 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
1040 dc_request.appendChild(dc_param_list);
1041
1042 // get the content
1043 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
1044 dc_request.appendChild(doc_list);
1045 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
1046 doc_list.appendChild(current_doc);
1047 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
1048 //Append highlighted content to request for processing
1049 dc_request.appendChild(doc.importNode(highlighted_Node, true));
1050 Element hl_response_message = (Element) this.mr.process(hl_message);
1051
1052 //Get results
1053 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
1054 Element content = (Element) contentList.item(0);
1055 return content;
1056 }
1057 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
1058 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
1059 if (query_term_list_element == null)
1060 {
1061 // no term info
1062 logger.error("No query term information.\n");
1063 return dc_response_doc_content;
1064 }
1065
1066 String content = GSXML.getNodeText(dc_response_doc_content);
1067
1068 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
1069 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
1070
1071 HashSet<String> query_term_variants = new HashSet<String>();
1072 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
1073 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
1074 {
1075 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
1076 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
1077 {
1078 for (int i = 0; i < terms_nodelist.getLength(); i++)
1079 {
1080 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
1081 String termValueU = null;
1082 String termValueL = null;
1083
1084 if (termValue.length() > 1)
1085 {
1086 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
1087 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
1088 }
1089 else
1090 {
1091 termValueU = termValue.substring(0, 1).toUpperCase();
1092 termValueL = termValue.substring(0, 1).toLowerCase();
1093 }
1094
1095 query_term_variants.add(termValueU);
1096 query_term_variants.add(termValueL);
1097 }
1098 }
1099 }
1100 else
1101 {
1102 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
1103 {
1104 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
1105 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
1106 for (int j = 0; j < equivalent_terms.length; j++)
1107 {
1108 query_term_variants.add(equivalent_terms[j]);
1109 }
1110 }
1111 }
1112
1113 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
1114
1115 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
1116 String performed_query = GSXML.getNodeText(query_element) + " ";
1117
1118 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1119 int term_start = 0;
1120 boolean in_term = false;
1121 boolean in_phrase = false;
1122 for (int i = 0; i < performed_query.length(); i++)
1123 {
1124 char character = performed_query.charAt(i);
1125 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
1126
1127 // Has a query term just started?
1128 if (in_term == false && is_character_letter_or_digit == true)
1129 {
1130 in_term = true;
1131 term_start = i;
1132 }
1133
1134 // Or has a term just finished?
1135 else if (in_term == true && is_character_letter_or_digit == false)
1136 {
1137 in_term = false;
1138 String term = performed_query.substring(term_start, i);
1139
1140 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
1141 if (term_element != null)
1142 {
1143
1144 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
1145
1146 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
1147 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
1148 {
1149 String termValueU = null;
1150 String termValueL = null;
1151
1152 if (term.length() > 1)
1153 {
1154 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
1155 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
1156 }
1157 else
1158 {
1159 termValueU = term.substring(0, 1).toUpperCase();
1160 termValueL = term.substring(0, 1).toLowerCase();
1161 }
1162
1163 phrase_query_p_term_x_variants.add(termValueU);
1164 phrase_query_p_term_x_variants.add(termValueL);
1165 }
1166 else
1167 {
1168 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
1169 {
1170 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
1171 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
1172 for (int k = 0; k < term_equivalent_terms.length; k++)
1173 {
1174 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
1175 }
1176 }
1177 }
1178 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
1179
1180 if (in_phrase == false)
1181 {
1182 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1183 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1184 }
1185 }
1186 }
1187 // Watch for phrases (surrounded by quotes)
1188 if (character == '\"')
1189 {
1190 // Has a phrase just started?
1191 if (in_phrase == false)
1192 {
1193 in_phrase = true;
1194 }
1195 // Or has a phrase just finished?
1196 else if (in_phrase == true)
1197 {
1198 in_phrase = false;
1199 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
1200 }
1201
1202 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
1203 }
1204 }
1205
1206 return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
1207 }
1208
1209 /**
1210 * Highlights query terms in a piece of text.
1211 */
1212 private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
1213 {
1214 // Convert the content string to an array of characters for speed
1215 char[] content_characters = new char[content.length()];
1216 content.getChars(0, content.length(), content_characters, 0);
1217
1218 // Now skim through the content, identifying word matches
1219 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
1220 int word_start = 0;
1221 boolean in_word = false;
1222 boolean preceding_word_matched = false;
1223 boolean inTag = false;
1224 for (int i = 0; i < content_characters.length; i++)
1225 {
1226 //We don't want to find words inside HTML tags
1227 if (content_characters[i] == '<')
1228 {
1229 inTag = true;
1230 continue;
1231 }
1232 else if (inTag && content_characters[i] == '>')
1233 {
1234 inTag = false;
1235 }
1236 else if (inTag)
1237 {
1238 continue;
1239 }
1240
1241 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1242
1243 // Has a word just started?
1244 if (in_word == false && is_character_letter_or_digit == true)
1245 {
1246 in_word = true;
1247 word_start = i;
1248 }
1249
1250 // Or has a word just finished?
1251 else if (in_word == true && is_character_letter_or_digit == false)
1252 {
1253 in_word = false;
1254
1255 // Check if the word matches any of the query term equivalents
1256 String word = new String(content_characters, word_start, (i - word_start));
1257 if (query_term_variants.contains(word))
1258 {
1259 // We have found a matching word, so remember its location
1260 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1261 preceding_word_matched = true;
1262 }
1263 else
1264 {
1265 preceding_word_matched = false;
1266 }
1267 }
1268 }
1269
1270 // Don't forget the last word...
1271 if (in_word == true)
1272 {
1273 // Check if the word matches any of the query term equivalents
1274 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1275 if (query_term_variants.contains(word))
1276 {
1277 // We have found a matching word, so remember its location
1278 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1279 }
1280 }
1281
1282 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1283 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1284
1285 // Deal with phrases now
1286 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1287 for (int i = 0; i < word_matches.size(); i++)
1288 {
1289 WordMatch word_match = word_matches.get(i);
1290
1291 // See if any partial phrase matches are extended by this word
1292 if (word_match.preceding_word_matched)
1293 {
1294 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1295 {
1296 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1297 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1298 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1299 if (phrase_query_p_term_x_variants.contains(word_match.word))
1300 {
1301 partial_phrase_match.num_words_matched++;
1302
1303 // Has a complete phrase match occurred?
1304 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1305 {
1306 // Check for overlaps by looking at the previous highlight range
1307 if (!highlight_end_positions.isEmpty())
1308 {
1309 int last_highlight_index = highlight_end_positions.size() - 1;
1310 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1311 if (last_highlight_end > partial_phrase_match.start_position)
1312 {
1313 // There is an overlap, so remove the previous phrase match
1314 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1315 highlight_end_positions.remove(last_highlight_index);
1316 partial_phrase_match.start_position = last_highlight_start;
1317 }
1318 }
1319
1320 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1321 highlight_end_positions.add(new Integer(word_match.end_position));
1322 }
1323 // No, but add the partial match back into the list for next time
1324 else
1325 {
1326 partial_phrase_matches.add(partial_phrase_match);
1327 }
1328 }
1329 }
1330 }
1331 else
1332 {
1333 partial_phrase_matches.clear();
1334 }
1335
1336 // See if this word is at the start of any of the phrases
1337 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1338 {
1339 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1340 if (phrase_query_p_term_variants_list.size()>0) {
1341 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1342 if (phrase_query_p_term_1_variants.contains(word_match.word))
1343 {
1344 // If this phrase is just one word long, we have a complete match
1345 if (phrase_query_p_term_variants_list.size() == 1)
1346 {
1347 highlight_start_positions.add(new Integer(word_match.start_position));
1348 highlight_end_positions.add(new Integer(word_match.end_position));
1349 }
1350 // Otherwise we have the start of a potential phrase match
1351 else
1352 {
1353 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1354 }
1355 }
1356 }
1357 }
1358 }
1359
1360 // Now add the annotation tags into the document at the correct points
1361 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
1362
1363 int last_wrote = 0;
1364 for (int i = 0; i < highlight_start_positions.size(); i++)
1365 {
1366 int highlight_start = highlight_start_positions.get(i).intValue();
1367 int highlight_end = highlight_end_positions.get(i).intValue();
1368
1369 // Print anything before the highlight range
1370 if (last_wrote < highlight_start)
1371 {
1372 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1373 content_element.appendChild(doc.createTextNode(preceding_text));
1374 }
1375
1376 // Print the highlight text, annotated
1377 if (highlight_end > last_wrote)
1378 {
1379 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1380 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
1381 annotation_element.setAttribute("type", "query_term");
1382 content_element.appendChild(annotation_element);
1383 last_wrote = highlight_end;
1384 }
1385 }
1386
1387 // Finish off any unwritten text
1388 if (last_wrote < content_characters.length)
1389 {
1390 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1391 content_element.appendChild(doc.createTextNode(remaining_text));
1392 }
1393 return content_element;
1394 }
1395
1396 static private class WordMatch
1397 {
1398 public String word;
1399 public int start_position;
1400 public int end_position;
1401 public boolean preceding_word_matched;
1402
1403 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1404 {
1405 this.word = word;
1406 this.start_position = start_position;
1407 this.end_position = end_position;
1408 this.preceding_word_matched = preceding_word_matched;
1409 }
1410 }
1411
1412 static private class PartialPhraseMatch
1413 {
1414 public int start_position;
1415 public int query_phrase_number;
1416 public int num_words_matched;
1417
1418 public PartialPhraseMatch(int start_position, int query_phrase_number)
1419 {
1420 this.start_position = start_position;
1421 this.query_phrase_number = query_phrase_number;
1422 this.num_words_matched = 1;
1423 }
1424 }
1425}
Note: See TracBrowser for help on using the repository browser.