source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 24813

Last change on this file since 24813 was 24813, checked in by sjm84, 12 years ago

Fixed highlighting inside tags

  • Property svn:keywords set to Author Date Id Revision
File size: 39.1 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38import org.apache.log4j.*;
39
40/** Action class for retrieving Documents via the message router */
41public class DocumentAction extends Action
42{
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46 // this is used to specify that the sibling nodes of a selected one should be obtained
47 public static final String SIBLING_ARG = "sib";
48 public static final String GOTO_PAGE_ARG = "gp";
49 public static final String ENRICH_DOC_ARG = "end";
50
51 /**
52 * if this is set to true, when a document is displayed, any annotation type
53 * services (enrich) will be offered to the user as well
54 */
55 protected boolean provide_annotations = false;
56
57 protected boolean highlight_query_terms = false;
58
59 public boolean configure()
60 {
61 super.configure();
62 String highlight = (String) config_params.get("highlightQueryTerms");
63 if (highlight != null && highlight.equals("true"))
64 {
65 highlight_query_terms = true;
66 }
67 String annotate = (String) config_params.get("displayAnnotationService");
68 if (annotate != null && annotate.equals("true"))
69 {
70 provide_annotations = true;
71 }
72 return true;
73 }
74
75 public Node process(Node message_node)
76 {
77 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
78
79 Element message = this.converter.nodeToElement(message_node);
80
81 // the response
82 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
83 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
84 result.appendChild(page_response);
85
86 // get the request - assume only one
87 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
88 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
89 HashMap params = GSXML.extractParams(cgi_paramList, false);
90
91 // just in case there are some that need to get passed to the services
92 HashMap service_params = (HashMap) params.get("s0");
93
94 String has_rl = null;
95 String has_href = null;
96 has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
97 has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
98 String collection = (String) params.get(GSParams.COLLECTION);
99 String lang = request.getAttribute(GSXML.LANG_ATT);
100 String uid = request.getAttribute(GSXML.USER_ID_ATT);
101 String document_name = (String) params.get(GSParams.DOCUMENT);
102 if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals("")))
103 {
104 logger.error("no document specified!");
105 return result;
106 }
107 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
108 if (document_type == null)
109 {
110 document_type = "simple";
111 }
112 //whether to retrieve siblings or not
113 boolean get_siblings = false;
114 String sibs = (String) params.get(SIBLING_ARG);
115 if (sibs != null && sibs.equals("1"))
116 {
117 get_siblings = true;
118 }
119
120 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
121 if (sibling_num != null && !sibling_num.equals(""))
122 {
123 // we have to modify the doc name
124 document_name = document_name + "." + sibling_num + ".ss";
125 }
126
127 boolean expand_document = false;
128 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
129 if (ed_arg != null && ed_arg.equals("1"))
130 {
131 expand_document = true;
132 }
133
134 boolean expand_contents = false;
135 if (expand_document)
136 { // we always expand the contents with the text
137 expand_contents = true;
138 }
139 else
140 {
141 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
142 if (ec_arg != null && ec_arg.equals("1"))
143 {
144 expand_contents = true;
145 }
146 }
147
148 //append site metadata
149 addSiteMetadata(page_response, lang, uid);
150
151 // get the additional data needed for the page
152 getBackgroundData(page_response, collection, lang, uid);
153 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
154
155 // the_document is where all the doc info - structure and metadata etc
156 // is added into, to be returned in the page
157 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
158 page_response.appendChild(the_document);
159
160 // set the doctype from the cgi arg as an attribute
161 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
162
163 // create a basic doc list containing the current node
164 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
165 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
166 basic_doc_list.appendChild(current_doc);
167 if (document_name.length() != 0)
168 {
169 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
170 }
171 else if (has_href.length() != 0)
172 {
173 current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
174 current_doc.setAttribute("externalURL", has_rl);
175 }
176
177 // Create a parameter list to specify the required structure information
178 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
179
180 if (service_params != null)
181 {
182 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
183 }
184
185 Element ds_param = null;
186 boolean get_structure = false;
187 boolean get_structure_info = false;
188 if (document_type.equals("paged"))
189 {
190 get_structure_info = true;
191 // get teh info needed for paged naviagtion
192 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
193 ds_param_list.appendChild(ds_param);
194 ds_param.setAttribute(GSXML.NAME_ATT, "info");
195 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
196 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
197 ds_param_list.appendChild(ds_param);
198 ds_param.setAttribute(GSXML.NAME_ATT, "info");
199 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
200 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
201 ds_param_list.appendChild(ds_param);
202 ds_param.setAttribute(GSXML.NAME_ATT, "info");
203 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
204
205 }
206 else if (document_type.equals("hierarchy"))
207 {
208 get_structure = true;
209 if (expand_contents)
210 {
211 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
212 ds_param_list.appendChild(ds_param);
213 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
214 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
215 }
216 else
217 {
218 // get the info needed for table of contents
219 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
220 ds_param_list.appendChild(ds_param);
221 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
222 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
223 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
224 ds_param_list.appendChild(ds_param);
225 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
226 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
227 if (get_siblings)
228 {
229 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
230 ds_param_list.appendChild(ds_param);
231 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
232 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
233 }
234 }
235 }
236 else
237 {
238 // we dont need any structure
239 }
240
241 boolean has_dummy = false;
242 if (get_structure || get_structure_info)
243 {
244
245 // Build a request to obtain the document structure
246 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
247 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
248 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
249 ds_message.appendChild(ds_request);
250 ds_request.appendChild(ds_param_list);
251
252 // create a doc_node_list and put in the doc_node that we are interested in
253 ds_request.appendChild(basic_doc_list);
254
255 // Process the document structure retrieve message
256 Element ds_response_message = (Element) this.mr.process(ds_message);
257 if (processErrorElements(ds_response_message, page_response))
258 {
259 return result;
260 }
261
262 // get the info and print out
263 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
264 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
265 path = GSPath.appendLink(path, "nodeStructureInfo");
266 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
267 // get the doc_node bit
268 if (ds_response_struct_info != null)
269 {
270 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
271 }
272 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
273 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
274 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
275 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
276
277 if (ds_response_structure != null)
278 {
279 // add the contents of the structure bit into the_document
280 NodeList structs = ds_response_structure.getChildNodes();
281 for (int i = 0; i < structs.getLength(); i++)
282 {
283 the_document.appendChild(this.doc.importNode(structs.item(i), true));
284 }
285 }
286 else
287 {
288 // no structure nodes, so put in a dummy doc node
289 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
290 if (document_name.length() != 0)
291 {
292 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
293 }
294 else if (has_href.length() != 0)
295 {
296 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
297 doc_node.setAttribute("externalURL", has_rl);
298 }
299 the_document.appendChild(doc_node);
300 has_dummy = true;
301 }
302 }
303 else
304 { // a simple type - we dont have a dummy node for simple
305 // should think about this more
306 // no structure request, so just put in a dummy doc node
307 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
308 if (document_name.length() != 0)
309 {
310 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
311 }
312 else if (has_href.length() != 0)
313 {
314 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
315 doc_node.setAttribute("externalURL", has_rl);
316 }
317 the_document.appendChild(doc_node);
318 has_dummy = true;
319 }
320
321 // Build a request to obtain some document metadata
322 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
323 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
324 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
325 dm_message.appendChild(dm_request);
326 // Create a parameter list to specify the required metadata information
327
328 HashSet meta_names = new HashSet();
329 meta_names.add("Title"); // the default
330 if (format_elem != null)
331 {
332 extractMetadataNames(format_elem, meta_names);
333 }
334
335 Element dm_param_list = createMetadataParamList(meta_names);
336 if (service_params != null)
337 {
338 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
339 }
340
341 dm_request.appendChild(dm_param_list);
342
343 // create the doc node list for the metadata request
344 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
345 dm_request.appendChild(dm_doc_list);
346
347 // Add each node from the structure response into the metadata request
348 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
349 for (int i = 0; i < doc_nodes.getLength(); i++)
350 {
351 Element doc_node = (Element) doc_nodes.item(i);
352 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
353
354 // Add the documentNode to the list
355 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
356 dm_doc_list.appendChild(dm_doc_node);
357 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
358 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
359 }
360
361 // we also want a metadata request to the top level document to get
362 // assocfilepath - this could be cached too
363 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
364 dm_message.appendChild(doc_meta_request);
365 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
366 if (service_params != null)
367 {
368 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
369 }
370
371 doc_meta_request.appendChild(doc_meta_param_list);
372 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
373 doc_meta_param_list.appendChild(doc_param);
374 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
375 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
376
377 // create the doc node list for the metadata request
378 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
379 doc_meta_request.appendChild(doc_list);
380
381 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
382 // the node we want is the root document node
383 if (document_name.length() != 0)
384 {
385 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name + ".rt");
386 }
387 else if (has_href.length() != 0)
388 {
389 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href + ".rt");
390 doc_node.setAttribute("externalURL", has_rl);
391 }
392 doc_list.appendChild(doc_node);
393 Element dm_response_message = (Element) this.mr.process(dm_message);
394 if (processErrorElements(dm_response_message, page_response))
395 {
396 return result;
397 }
398
399 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
400 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
401
402 // Merge the metadata with the structure information
403 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
404 for (int i = 0; i < doc_nodes.getLength(); i++)
405 {
406 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
407 }
408 // get the top level doc metadata out
409 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
410 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
411 GSXML.mergeMetadataLists(the_document, top_doc_node);
412
413 // Build a request to obtain some document content
414 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
415 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
416 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
417 dc_message.appendChild(dc_request);
418
419 // Create a parameter list to specify the request parameters - empty for now
420 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
421 if (service_params != null)
422 {
423 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
424 }
425
426 dc_request.appendChild(dc_param_list);
427
428 // get the content
429 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
430 if (expand_document)
431 {
432 dc_request.appendChild(dm_doc_list);
433 }
434 else
435 {
436 dc_request.appendChild(basic_doc_list);
437 }
438 logger.debug("request = " + converter.getString(dc_message));
439 Element dc_response_message = (Element) this.mr.process(dc_message);
440 if (processErrorElements(dc_response_message, page_response))
441 {
442 return result;
443 }
444
445 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
446
447 if (expand_document)
448 {
449 // Merge the content with the structure information
450 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
451 for (int i = 0; i < doc_nodes.getLength(); i++)
452 {
453 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
454 if (content != null)
455 {
456 if (highlight_query_terms)
457 {
458 content = highlightQueryTerms(request, (Element) content);
459 }
460 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
461 }
462 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
463 }
464 }
465 else
466 {
467 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
468 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
469 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
470 Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
471
472 if (dc_response_doc_content == null)
473 {
474 // no content to add
475 if (dc_response_doc_external != null)
476 {
477 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
478
479 the_document.setAttribute("selectedNode", modified_doc_id);
480 the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
481 }
482 return result;
483 }
484 if (highlight_query_terms)
485 {
486 dc_response_doc.removeChild(dc_response_doc_content);
487
488 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
489 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
490 }
491
492 if (provide_annotations)
493 {
494 String service_selected = (String) params.get(ENRICH_DOC_ARG);
495 if (service_selected != null && service_selected.equals("1"))
496 {
497 // now we can modifiy the response doc if needed
498 String enrich_service = (String) params.get(GSParams.SERVICE);
499 // send a message to the service
500 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
501 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
502 enrich_message.appendChild(enrich_request);
503 // check for parameters
504 HashMap e_service_params = (HashMap) params.get("s1");
505 if (e_service_params != null)
506 {
507 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
508 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
509 enrich_request.appendChild(enrich_pl);
510 }
511 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
512 enrich_request.appendChild(e_doc_list);
513 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
514
515 Node enrich_response = this.mr.process(enrich_message);
516
517 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
518 path = GSPath.createPath(links);
519 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
520
521 }
522 } // if provide_annotations
523
524 // use the returned id rather than the sent one cos there may have
525 // been modifiers such as .pr that are removed.
526 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
527 the_document.setAttribute("selectedNode", modified_doc_id);
528 if (has_dummy)
529 {
530 // change the id if necessary and add the content
531 Element dummy_node = (Element) doc_nodes.item(0);
532
533 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
534 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
535 // hack for simple type
536 if (document_type.equals("simple"))
537 {
538 // we dont want the internal docNode, just want the content and metadata in the document
539 // rethink this!!
540 the_document.removeChild(dummy_node);
541
542 NodeList dummy_children = dummy_node.getChildNodes();
543 //for (int i=0; i<dummy_children.getLength(); i++) {
544 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
545 {
546 // special case as we don't want more than one metadata list
547 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
548 {
549 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
550 }
551 else
552 {
553 the_document.appendChild(dummy_children.item(i));
554 }
555 }
556 }
557 }
558 else
559 {
560 // Merge the document content with the metadata and structure information
561 for (int i = 0; i < doc_nodes.getLength(); i++)
562 {
563 Node dn = doc_nodes.item(i);
564 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
565 if (dn_id.equals(modified_doc_id))
566 {
567 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
568 break;
569 }
570 }
571 }
572 }
573 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
574 return result;
575 }
576
577 /**
578 * tell the param class what its arguments are if an action has its own
579 * arguments, this should add them to the params object - particularly
580 * important for args that should not be saved
581 */
582 public boolean getActionParameters(GSParams params)
583 {
584 params.addParameter(GOTO_PAGE_ARG, false);
585 params.addParameter(ENRICH_DOC_ARG, false);
586 return true;
587 }
588
589 /**
590 * this method gets the collection description, the format info, the list of
591 * enrich services, etc - stuff that is needed for the page, but is the same
592 * whatever the query is - should be cached
593 */
594 protected boolean getBackgroundData(Element page_response, String collection, String lang, String uid)
595 {
596
597 // create a message to process - contains requests for the collection
598 // description, the format element, the enrich services on offer
599 // these could all be cached
600 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
601 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
602 // the format request - ignore for now, where does this request go to??
603 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
604 info_message.appendChild(format_request);
605
606 // the enrich_services request - only do this if provide_annotations is true
607
608 if (provide_annotations)
609 {
610 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
611 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
612 info_message.appendChild(enrich_services_request);
613 }
614
615 Element info_response = (Element) this.mr.process(info_message);
616
617 // the collection is the first response
618 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
619 Element format_resp = (Element) responses.item(0);
620
621 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
622 if (format_elem != null)
623 {
624 logger.debug("doc action found a format statement");
625 // set teh format type
626 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
627 page_response.appendChild(this.doc.importNode(format_elem, true));
628 }
629
630 if (provide_annotations)
631 {
632 Element services_resp = (Element) responses.item(1);
633
634 // a new message for the mr
635 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
636
637 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
638 boolean service_found = false;
639 for (int j = 0; j < e_services.getLength(); j++)
640 {
641 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
642 {
643 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
644 enrich_message.appendChild(s);
645 service_found = true;
646 }
647 }
648 if (service_found)
649 {
650 Element enrich_response = (Element) this.mr.process(enrich_message);
651
652 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
653 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
654 for (int i = 0; i < e_responses.getLength(); i++)
655 {
656 Element e_resp = (Element) e_responses.item(i);
657 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
658 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
659 service_list.appendChild(e_service);
660 }
661 page_response.appendChild(service_list);
662 }
663 } // if provide_annotations
664 return true;
665
666 }
667
668 /**
669 * this involves a bit of a hack to get the equivalent query terms - has to
670 * requery the query service - uses the last selected service name. (if it
671 * ends in query). should this action do the query or should it send a
672 * message to the query action? but that will involve lots of extra stuff.
673 * also doesn't handle phrases properly - just highlights all the terms found
674 * in the text.
675 */
676 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
677 {
678
679 // do the query again to get term info
680 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
681 HashMap params = GSXML.extractParams(cgi_param_list, false);
682
683 HashMap previous_params = (HashMap) params.get("p");
684 if (previous_params == null)
685 {
686 return dc_response_doc_content;
687 }
688 String service_name = (String) previous_params.get(GSParams.SERVICE);
689 if (service_name == null || !service_name.endsWith("Query"))
690 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
691 logger.debug("invalid service, not doing highlighting");
692 return dc_response_doc_content;
693 }
694 String collection = (String) params.get(GSParams.COLLECTION);
695 String lang = request.getAttribute(GSXML.LANG_ATT);
696 String uid = request.getAttribute(GSXML.USER_ID_ATT);
697 String to = GSPath.appendLink(collection, service_name);
698
699 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
700 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
701 mr_query_message.appendChild(mr_query_request);
702
703 // paramList
704 HashMap service_params = (HashMap) params.get("s1");
705
706 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
707 GSXML.addParametersToList(this.doc, query_param_list, service_params);
708 mr_query_request.appendChild(query_param_list);
709
710 // do the query
711 Element mr_query_response = (Element) this.mr.process(mr_query_message);
712
713 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
714 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
715 if (query_term_list_element == null)
716 {
717 // no term info
718 logger.error("No query term information.\n");
719 return dc_response_doc_content;
720 }
721
722 String content = GSXML.getNodeText(dc_response_doc_content);
723
724 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
725 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
726
727 HashSet query_term_variants = new HashSet();
728 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
729 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
730 {
731 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
732 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
733 {
734 for (int i = 0; i < terms_nodelist.getLength(); i++)
735 {
736 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
737 String termValueU = null;
738 String termValueL = null;
739
740 if (termValue.length() > 1)
741 {
742 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
743 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
744 }
745 else
746 {
747 termValueU = termValue.substring(0, 1).toUpperCase();
748 termValueL = termValue.substring(0, 1).toLowerCase();
749 }
750
751 query_term_variants.add(termValueU);
752 query_term_variants.add(termValueL);
753 }
754 }
755 }
756 else
757 {
758 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
759 {
760 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
761 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
762 for (int j = 0; j < equivalent_terms.length; j++)
763 {
764 query_term_variants.add(equivalent_terms[j]);
765 }
766 }
767 }
768
769 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
770
771 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
772 String performed_query = GSXML.getNodeText(query_element) + " ";
773
774 ArrayList phrase_query_p_term_variants_list = new ArrayList();
775 int term_start = 0;
776 boolean in_term = false;
777 boolean in_phrase = false;
778 for (int i = 0; i < performed_query.length(); i++)
779 {
780 char character = performed_query.charAt(i);
781 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
782
783 // Has a query term just started?
784 if (in_term == false && is_character_letter_or_digit == true)
785 {
786 in_term = true;
787 term_start = i;
788 }
789
790 // Or has a term just finished?
791 else if (in_term == true && is_character_letter_or_digit == false)
792 {
793 in_term = false;
794 String term = performed_query.substring(term_start, i);
795
796 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
797 if (term_element != null)
798 {
799
800 HashSet phrase_query_p_term_x_variants = new HashSet();
801
802 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
803 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
804 {
805 String termValueU = null;
806 String termValueL = null;
807
808 if (term.length() > 1)
809 {
810 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
811 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
812 }
813 else
814 {
815 termValueU = term.substring(0, 1).toUpperCase();
816 termValueL = term.substring(0, 1).toLowerCase();
817 }
818
819 phrase_query_p_term_x_variants.add(termValueU);
820 phrase_query_p_term_x_variants.add(termValueL);
821 }
822 else
823 {
824 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
825 {
826 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
827 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
828 for (int k = 0; k < term_equivalent_terms.length; k++)
829 {
830 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
831 }
832 }
833 }
834 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
835
836 if (in_phrase == false)
837 {
838 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
839 phrase_query_p_term_variants_list = new ArrayList();
840 }
841 }
842 }
843 // Watch for phrases (surrounded by quotes)
844 if (character == '\"')
845 {
846 // Has a phrase just started?
847 if (in_phrase == false)
848 {
849 in_phrase = true;
850 }
851 // Or has a phrase just finished?
852 else if (in_phrase == true)
853 {
854 in_phrase = false;
855 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
856 }
857
858 phrase_query_p_term_variants_list = new ArrayList();
859 }
860 }
861
862 System.err.println(query_term_variants + " *** " + phrase_query_term_variants_hierarchy);
863 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
864 }
865
866 /**
867 * Highlights query terms in a piece of text.
868 */
869 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
870 {
871 // Convert the content string to an array of characters for speed
872 char[] content_characters = new char[content.length()];
873 content.getChars(0, content.length(), content_characters, 0);
874
875 // Now skim through the content, identifying word matches
876 ArrayList word_matches = new ArrayList();
877 int word_start = 0;
878 boolean in_word = false;
879 boolean preceding_word_matched = false;
880 boolean inTag = false;
881 for (int i = 0; i < content_characters.length; i++)
882 {
883 //We don't want to find words inside HTML tags
884 if(content_characters[i] == '<')
885 {
886 inTag = true;
887 continue;
888 }
889 else if (inTag && content_characters[i] == '>')
890 {
891 inTag = false;
892 }
893 else if (inTag)
894 {
895 continue;
896 }
897
898 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
899
900 // Has a word just started?
901 if (in_word == false && is_character_letter_or_digit == true)
902 {
903 in_word = true;
904 word_start = i;
905 }
906
907 // Or has a word just finished?
908 else if (in_word == true && is_character_letter_or_digit == false)
909 {
910 in_word = false;
911
912 // Check if the word matches any of the query term equivalents
913 String word = new String(content_characters, word_start, (i - word_start));
914 if (query_term_variants.contains(word))
915 {
916 // We have found a matching word, so remember its location
917 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
918 preceding_word_matched = true;
919 }
920 else
921 {
922 preceding_word_matched = false;
923 }
924 }
925 }
926
927 // Don't forget the last word...
928 if (in_word == true)
929 {
930 // Check if the word matches any of the query term equivalents
931 String word = new String(content_characters, word_start, (content_characters.length - word_start));
932 if (query_term_variants.contains(word))
933 {
934 // We have found a matching word, so remember its location
935 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
936 }
937 }
938
939 ArrayList highlight_start_positions = new ArrayList();
940 ArrayList highlight_end_positions = new ArrayList();
941
942 // Deal with phrases now
943 ArrayList partial_phrase_matches = new ArrayList();
944 for (int i = 0; i < word_matches.size(); i++)
945 {
946 WordMatch word_match = (WordMatch) word_matches.get(i);
947
948 // See if any partial phrase matches are extended by this word
949 if (word_match.preceding_word_matched)
950 {
951 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
952 {
953 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
954 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
955 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
956 if (phrase_query_p_term_x_variants.contains(word_match.word))
957 {
958 partial_phrase_match.num_words_matched++;
959
960 // Has a complete phrase match occurred?
961 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
962 {
963 // Check for overlaps by looking at the previous highlight range
964 if (!highlight_end_positions.isEmpty())
965 {
966 int last_highlight_index = highlight_end_positions.size() - 1;
967 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
968 if (last_highlight_end > partial_phrase_match.start_position)
969 {
970 // There is an overlap, so remove the previous phrase match
971 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
972 highlight_end_positions.remove(last_highlight_index);
973 partial_phrase_match.start_position = last_highlight_start;
974 }
975 }
976
977 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
978 highlight_end_positions.add(new Integer(word_match.end_position));
979 }
980 // No, but add the partial match back into the list for next time
981 else
982 {
983 partial_phrase_matches.add(partial_phrase_match);
984 }
985 }
986 }
987 }
988 else
989 {
990 partial_phrase_matches.clear();
991 }
992
993 // See if this word is at the start of any of the phrases
994 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
995 {
996 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
997 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
998 if (phrase_query_p_term_1_variants.contains(word_match.word))
999 {
1000 // If this phrase is just one word long, we have a complete match
1001 if (phrase_query_p_term_variants_list.size() == 1)
1002 {
1003 highlight_start_positions.add(new Integer(word_match.start_position));
1004 highlight_end_positions.add(new Integer(word_match.end_position));
1005 }
1006 // Otherwise we have the start of a potential phrase match
1007 else
1008 {
1009 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1010 }
1011 }
1012 }
1013 }
1014
1015 // Now add the annotation tags into the document at the correct points
1016 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1017
1018 int last_wrote = 0;
1019 for (int i = 0; i < highlight_start_positions.size(); i++)
1020 {
1021 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
1022 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
1023
1024 // Print anything before the highlight range
1025 if (last_wrote < highlight_start)
1026 {
1027 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1028 content_element.appendChild(this.doc.createTextNode(preceding_text));
1029 }
1030
1031 // Print the highlight text, annotated
1032 if (highlight_end > last_wrote)
1033 {
1034 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1035 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1036 annotation_element.setAttribute("type", "query_term");
1037 content_element.appendChild(annotation_element);
1038 last_wrote = highlight_end;
1039 }
1040 }
1041
1042 // Finish off any unwritten text
1043 if (last_wrote < content_characters.length)
1044 {
1045 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1046 content_element.appendChild(this.doc.createTextNode(remaining_text));
1047 }
1048
1049 return content_element;
1050 }
1051
1052 static private class WordMatch
1053 {
1054 public String word;
1055 public int start_position;
1056 public int end_position;
1057 public boolean preceding_word_matched;
1058
1059 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1060 {
1061 this.word = word;
1062 this.start_position = start_position;
1063 this.end_position = end_position;
1064 this.preceding_word_matched = preceding_word_matched;
1065 }
1066 }
1067
1068 static private class PartialPhraseMatch
1069 {
1070 public int start_position;
1071 public int query_phrase_number;
1072 public int num_words_matched;
1073
1074 public PartialPhraseMatch(int start_position, int query_phrase_number)
1075 {
1076 this.start_position = start_position;
1077 this.query_phrase_number = query_phrase_number;
1078 this.num_words_matched = 1;
1079 }
1080 }
1081}
Note: See TracBrowser for help on using the repository browser.