source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 24219

Last change on this file since 24219 was 24116, checked in by sjm84, 13 years ago

Fixed search term highlighting in Lucene

  • Property svn:keywords set to Author Date Id Revision
File size: 38.4 KB
Line 
1/*
2* DocumentAction.java
3* Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4*
5* This program is free software; you can redistribute it and/or modify
6* it under the terms of the GNU General Public License as published by
7* the Free Software Foundation; either version 2 of the License, or
8* (at your option) any later version.
9*
10* This program is distributed in the hope that it will be useful,
11* but WITHOUT ANY WARRANTY; without even the implied warranty of
12* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13* GNU General Public License for more details.
14*
15* You should have received a copy of the GNU General Public License
16* along with this program; if not, write to the Free Software
17* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18*/
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38import org.apache.log4j.*;
39
40/** Action class for retrieving Documents via the message router
41*/
42public class DocumentAction extends Action {
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46 // this is used to specify that the sibling nodes of a selected one should be obtained
47 public static final String SIBLING_ARG = "sib";
48 public static final String GOTO_PAGE_ARG = "gp";
49 public static final String ENRICH_DOC_ARG = "end";
50
51 /** if this is set to true, when a document is displayed, any annotation
52 * type services (enrich) will be offered to the user as well */
53 protected boolean provide_annotations = false;
54
55 protected boolean highlight_query_terms = false;
56
57 public boolean configure() {
58 super.configure();
59 String highlight = (String)config_params.get("highlightQueryTerms");
60 if (highlight != null && highlight.equals("true")) {
61 highlight_query_terms = true;
62 }
63 String annotate = (String)config_params.get("displayAnnotationService");
64 if (annotate != null && annotate.equals("true")) {
65 provide_annotations = true;
66 }
67 return true;
68 }
69 public Node process (Node message_node)
70 {
71 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
72
73 Element message = this.converter.nodeToElement(message_node);
74
75 // the response
76 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
77 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
78 result.appendChild(page_response);
79
80 // get the request - assume only one
81 Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
82 Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
83 HashMap params = GSXML.extractParams(cgi_paramList, false);
84
85 // just in case there are some that need to get passed to the services
86 HashMap service_params = (HashMap)params.get("s0");
87
88
89 String has_rl = null;
90 String has_href = null;
91 has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
92 has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
93 String collection = (String) params.get(GSParams.COLLECTION);
94 String lang = request.getAttribute(GSXML.LANG_ATT);
95 String uid = request.getAttribute(GSXML.USER_ID_ATT);
96 String document_name = (String) params.get(GSParams.DOCUMENT);
97 if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals(""))) {
98 logger.error("no document specified!");
99 return result;
100 }
101 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
102 if (document_type == null) {
103 document_type = "simple";
104 }
105 //whether to retrieve siblings or not
106 boolean get_siblings = false;
107 String sibs = (String) params.get(SIBLING_ARG);
108 if (sibs != null && sibs.equals("1")) {
109 get_siblings = true;
110 }
111
112 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
113 if (sibling_num != null && !sibling_num.equals("")) {
114 // we have to modify the doc name
115 document_name = document_name+"."+sibling_num+".ss";
116 }
117
118 boolean expand_document = false;
119 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
120 if (ed_arg != null && ed_arg.equals("1")) {
121 expand_document = true;
122 }
123
124
125 boolean expand_contents = false;
126 if (expand_document) { // we always expand the contents with the text
127 expand_contents = true;
128 } else {
129 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
130 if (ec_arg != null && ec_arg.equals("1")) {
131 expand_contents = true;
132 }
133 }
134
135 //append site metadata
136 addSiteMetadata( page_response, lang, uid);
137
138 // get the additional data needed for the page
139 getBackgroundData(page_response, collection, lang, uid);
140 Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
141
142 // the_document is where all the doc info - structure and metadata etc
143 // is added into, to be returned in the page
144 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
145 page_response.appendChild(the_document);
146
147 // set the doctype from the cgi arg as an attribute
148 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
149
150 // create a basic doc list containing the current node
151 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
152 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
153 basic_doc_list.appendChild(current_doc);
154 if (document_name.length()!=0){
155 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
156 }else if (has_href.length()!=0){
157 current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
158 current_doc.setAttribute("externalURL", has_rl);
159 }
160
161 // Create a parameter list to specify the required structure information
162 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
163
164 if (service_params != null) {
165 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
166 }
167
168 Element ds_param = null;
169 boolean get_structure = false;
170 boolean get_structure_info = false;
171 if (document_type.equals("paged")) {
172 get_structure_info = true;
173 // get teh info needed for paged naviagtion
174 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
175 ds_param_list.appendChild(ds_param);
176 ds_param.setAttribute(GSXML.NAME_ATT, "info");
177 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
178 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
179 ds_param_list.appendChild(ds_param);
180 ds_param.setAttribute(GSXML.NAME_ATT, "info");
181 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
182 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
183 ds_param_list.appendChild(ds_param);
184 ds_param.setAttribute(GSXML.NAME_ATT, "info");
185 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
186
187 } else if (document_type.equals("hierarchy")){
188 get_structure = true;
189 if (expand_contents) {
190 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
191 ds_param_list.appendChild(ds_param);
192 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
193 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
194 } else {
195 // get the info needed for table of contents
196 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
197 ds_param_list.appendChild(ds_param);
198 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
199 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
200 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
201 ds_param_list.appendChild(ds_param);
202 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
203 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
204 if (get_siblings) {
205 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
206 ds_param_list.appendChild(ds_param);
207 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
208 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
209 }
210 }
211 } else {
212 // we dont need any structure
213 }
214
215 boolean has_dummy = false;
216 if (get_structure || get_structure_info) {
217
218 // Build a request to obtain the document structure
219 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
220 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
221 Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
222 ds_message.appendChild(ds_request);
223 ds_request.appendChild(ds_param_list);
224
225 // create a doc_node_list and put in the doc_node that we are interested in
226 ds_request.appendChild(basic_doc_list);
227
228 // Process the document structure retrieve message
229 Element ds_response_message = (Element) this.mr.process(ds_message);
230 if (processErrorElements(ds_response_message, page_response)) {
231 return result;
232 }
233
234 // get the info and print out
235 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
236 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
237 path = GSPath.appendLink(path, "nodeStructureInfo");
238 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
239 // get the doc_node bit
240 if (ds_response_struct_info != null) {
241 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
242 }
243 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
244 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
245 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
246 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
247
248 if (ds_response_structure != null) {
249 // add the contents of the structure bit into the_document
250 NodeList structs = ds_response_structure.getChildNodes();
251 for (int i=0; i<structs.getLength();i++) {
252 the_document.appendChild(this.doc.importNode(structs.item(i), true));
253 }
254 } else {
255 // no structure nodes, so put in a dummy doc node
256 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
257 if (document_name.length()!=0){
258 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
259 }else if (has_href.length()!=0){
260 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
261 doc_node.setAttribute("externalURL", has_rl);
262 }
263 the_document.appendChild(doc_node);
264 has_dummy = true;
265 }
266 } else { // a simple type - we dont have a dummy node for simple
267 // should think about this more
268 // no structure request, so just put in a dummy doc node
269 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
270 if (document_name.length()!=0){
271 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
272 }else if (has_href.length()!=0){
273 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
274 doc_node.setAttribute("externalURL", has_rl);
275 }
276 the_document.appendChild(doc_node);
277 has_dummy = true;
278 }
279
280 // Build a request to obtain some document metadata
281 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
282 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
283 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
284 dm_message.appendChild(dm_request);
285 // Create a parameter list to specify the required metadata information
286
287 HashSet meta_names = new HashSet();
288 meta_names.add("Title"); // the default
289 if (format_elem != null) {
290 extractMetadataNames(format_elem, meta_names);
291 }
292
293 Element dm_param_list = createMetadataParamList(meta_names);
294 if (service_params != null) {
295 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
296 }
297
298 dm_request.appendChild(dm_param_list);
299
300
301 // create the doc node list for the metadata request
302 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
303 dm_request.appendChild(dm_doc_list);
304
305 // Add each node from the structure response into the metadata request
306 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
307 for (int i = 0; i < doc_nodes.getLength(); i++) {
308 Element doc_node = (Element) doc_nodes.item(i);
309 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
310
311 // Add the documentNode to the list
312 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
313 dm_doc_list.appendChild(dm_doc_node);
314 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
315 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT,
316 doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
317 }
318
319 // we also want a metadata request to the top level document to get
320 // assocfilepath - this could be cached too
321 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
322 dm_message.appendChild(doc_meta_request);
323 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
324 if (service_params != null) {
325 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
326 }
327
328 doc_meta_request.appendChild(doc_meta_param_list);
329 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
330 doc_meta_param_list.appendChild(doc_param);
331 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
332 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
333
334 // create the doc node list for the metadata request
335 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
336 doc_meta_request.appendChild(doc_list);
337
338 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
339 // the node we want is the root document node
340 if (document_name.length()!=0){
341 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt");
342 }else if (has_href.length()!=0){
343 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href+".rt");
344 doc_node.setAttribute("externalURL", has_rl);
345 }
346 doc_list.appendChild(doc_node);
347 Element dm_response_message = (Element) this.mr.process(dm_message);
348 if (processErrorElements(dm_response_message, page_response)) {
349 return result;
350 }
351
352 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
353 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
354
355 // Merge the metadata with the structure information
356 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
357 for (int i = 0; i < doc_nodes.getLength(); i++) {
358 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
359 }
360 // get the top level doc metadata out
361 Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
362 Element top_doc_node = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
363 GSXML.mergeMetadataLists(the_document, top_doc_node);
364
365 // Build a request to obtain some document content
366 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
367 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
368 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
369 dc_message.appendChild(dc_request);
370
371
372 // Create a parameter list to specify the request parameters - empty for now
373 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
374 if (service_params != null) {
375 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
376 }
377
378 dc_request.appendChild(dc_param_list);
379
380 // get the content
381 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
382 if (expand_document) {
383 dc_request.appendChild(dm_doc_list);
384 } else {
385 dc_request.appendChild(basic_doc_list);
386 }
387 logger.debug("request = "+converter.getString(dc_message));
388 Element dc_response_message = (Element) this.mr.process(dc_message);
389 if (processErrorElements(dc_response_message, page_response)) {
390 return result;
391 }
392
393 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
394
395 if (expand_document) {
396 // Merge the content with the structure information
397 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
398 for (int i = 0; i < doc_nodes.getLength(); i++) {
399 Node content = GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent");
400 if (content != null) {
401 if (highlight_query_terms) {
402 content = highlightQueryTerms(request, (Element)content);
403 }
404 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
405 }
406 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
407 }
408 } else {
409 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
410 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
411 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
412 Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
413
414 if (dc_response_doc_content == null) {
415 // no content to add
416 if (dc_response_doc_external !=null){
417 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
418
419 the_document.setAttribute("selectedNode", modified_doc_id);
420 the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
421 }
422 return result;
423 }
424 if (highlight_query_terms) {
425 dc_response_doc.removeChild(dc_response_doc_content);
426
427 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
428 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
429 }
430
431
432 if (provide_annotations) {
433 String service_selected = (String)params.get(ENRICH_DOC_ARG);
434 if (service_selected != null && service_selected.equals("1")) {
435 // now we can modifiy the response doc if needed
436 String enrich_service = (String)params.get(GSParams.SERVICE);
437 // send a message to the service
438 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
439 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
440 enrich_message.appendChild(enrich_request);
441 // check for parameters
442 HashMap e_service_params = (HashMap)params.get("s1");
443 if (e_service_params != null) {
444 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
445 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
446 enrich_request.appendChild(enrich_pl);
447 }
448 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
449 enrich_request.appendChild(e_doc_list);
450 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
451
452 Node enrich_response = this.mr.process(enrich_message);
453
454 String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM};
455 path = GSPath.createPath(links);
456 dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path);
457
458 }
459 } // if provide_annotations
460
461
462 // use the returned id rather than the sent one cos there may have
463 // been modifiers such as .pr that are removed.
464 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
465 the_document.setAttribute("selectedNode", modified_doc_id);
466 if (has_dummy) {
467 // change the id if necessary and add the content
468 Element dummy_node = (Element)doc_nodes.item(0);
469
470 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
471 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
472 // hack for simple type
473 if (document_type.equals("simple")) {
474 // we dont want the internal docNode, just want the content and metadata in the document
475 // rethink this!!
476 the_document.removeChild(dummy_node);
477
478 NodeList dummy_children = dummy_node.getChildNodes();
479 //for (int i=0; i<dummy_children.getLength(); i++) {
480 for (int i=dummy_children.getLength()-1; i>=0; i--) {
481 // special case as we don't want more than one metadata list
482 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER)) {
483 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
484 } else {
485 the_document.appendChild(dummy_children.item(i));
486 }
487 }
488 }
489 } else {
490 // Merge the document content with the metadata and structure information
491 for (int i = 0; i < doc_nodes.getLength(); i++) {
492 Node dn = doc_nodes.item(i);
493 String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT);
494 if (dn_id.equals(modified_doc_id)) {
495 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
496 break;
497 }
498 }
499 }
500 }
501 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
502 return result;
503 }
504
505 /** tell the param class what its arguments are
506 * if an action has its own arguments, this should add them to the params
507 * object - particularly important for args that should not be saved */
508 public boolean getActionParameters(GSParams params) {
509 params.addParameter(GOTO_PAGE_ARG, false);
510 params.addParameter(ENRICH_DOC_ARG, false);
511 return true;
512 }
513
514
515 /** this method gets the collection description, the format info, the
516 * list of enrich services, etc - stuff that is needed for the page,
517 * but is the same whatever the query is - should be cached */
518 protected boolean getBackgroundData(Element page_response,
519 String collection, String lang,
520 String uid) {
521
522 // create a message to process - contains requests for the collection
523 // description, the format element, the enrich services on offer
524 // these could all be cached
525 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
526 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
527 // the format request - ignore for now, where does this request go to??
528 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
529 info_message.appendChild(format_request);
530
531 // the enrich_services request - only do this if provide_annotations is true
532
533 if (provide_annotations) {
534 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
535 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
536 info_message.appendChild(enrich_services_request);
537 }
538
539 Element info_response = (Element)this.mr.process(info_message);
540
541 // the collection is the first response
542 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
543 Element format_resp = (Element) responses.item(0);
544
545 Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
546 if (format_elem != null) {
547 logger.debug("doc action found a format statement");
548 // set teh format type
549 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
550 page_response.appendChild(this.doc.importNode(format_elem, true));
551 }
552
553 if (provide_annotations) {
554 Element services_resp = (Element)responses.item(1);
555
556 // a new message for the mr
557 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
558
559 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
560 boolean service_found = false;
561 for (int j=0; j<e_services.getLength(); j++) {
562 if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) {
563 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
564 enrich_message.appendChild(s);
565 service_found = true;
566 }
567 }
568 if (service_found) {
569 Element enrich_response = (Element)this.mr.process(enrich_message);
570
571 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
572 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
573 for (int i=0; i<e_responses.getLength(); i++) {
574 Element e_resp = (Element)e_responses.item(i);
575 Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
576 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
577 service_list.appendChild(e_service);
578 }
579 page_response.appendChild(service_list);
580 }
581 } // if provide_annotations
582 return true;
583
584 }
585
586 /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. also doesn't handle phrases properly - just highlights all the terms found in the text.
587 */
588 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) {
589
590 // do the query again to get term info
591 Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
592 HashMap params = GSXML.extractParams(cgi_param_list, false);
593
594 HashMap previous_params = (HashMap)params.get("p");
595 if (previous_params == null) {
596 return dc_response_doc_content;
597 }
598 String service_name = (String)previous_params.get(GSParams.SERVICE);
599 if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
600 logger.debug("invalid service, not doing highlighting");
601 return dc_response_doc_content;
602 }
603 String collection = (String)params.get(GSParams.COLLECTION);
604 String lang = request.getAttribute(GSXML.LANG_ATT);
605 String uid = request.getAttribute(GSXML.USER_ID_ATT);
606 String to = GSPath.appendLink(collection, service_name);
607
608 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
609 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
610 mr_query_message.appendChild(mr_query_request);
611
612 // paramList
613 HashMap service_params = (HashMap)params.get("s1");
614
615 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
616 GSXML.addParametersToList(this.doc, query_param_list, service_params);
617 mr_query_request.appendChild(query_param_list);
618
619 // do the query
620 Element mr_query_response = (Element)this.mr.process(mr_query_message);
621
622 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);
623 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
624 if (query_term_list_element == null) {
625 // no term info
626 logger.error("No query term information.\n");
627 return dc_response_doc_content;
628 }
629
630 String content = GSXML.getNodeText(dc_response_doc_content);
631
632 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
633 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
634
635 HashSet query_term_variants = new HashSet();
636 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
637 if(equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
638 {
639 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
640 if(terms_nodelist != null && terms_nodelist.getLength() > 0)
641 {
642 for(int i = 0; i < terms_nodelist.getLength(); i++)
643 {
644 String termValue = ((Element)terms_nodelist.item(i)).getAttribute("name");
645 String termValueU = null;
646 String termValueL = null;
647
648 if(termValue.length() > 1)
649 {
650 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
651 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
652 }
653 else
654 {
655 termValueU = termValue.substring(0, 1).toUpperCase();
656 termValueL = termValue.substring(0, 1).toLowerCase();
657 }
658
659 query_term_variants.add(termValueU);
660 query_term_variants.add(termValueL);
661 }
662 }
663 }
664 else
665 {
666 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
667 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
668 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
669 for (int j = 0; j < equivalent_terms.length; j++) {
670 query_term_variants.add(equivalent_terms[j]);
671 }
672 }
673 }
674
675 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
676
677 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
678 String performed_query = GSXML.getNodeText(query_element) + " ";
679
680 ArrayList phrase_query_p_term_variants_list = new ArrayList();
681 int term_start = 0;
682 boolean in_term = false;
683 boolean in_phrase = false;
684 for (int i = 0; i < performed_query.length(); i++) {
685 char character = performed_query.charAt(i);
686 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
687
688 // Has a query term just started?
689 if (in_term == false && is_character_letter_or_digit == true) {
690 in_term = true;
691 term_start = i;
692 }
693
694 // Or has a term just finished?
695 else if (in_term == true && is_character_letter_or_digit == false) {
696 in_term = false;
697 String term = performed_query.substring(term_start, i);
698
699 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
700 if (term_element != null) {
701
702 HashSet phrase_query_p_term_x_variants = new HashSet();
703
704 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
705 if(term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
706 {
707 String termValueU = null;
708 String termValueL = null;
709
710 if(term.length() > 1)
711 {
712 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
713 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
714 }
715 else
716 {
717 termValueU = term.substring(0, 1).toUpperCase();
718 termValueL = term.substring(0, 1).toLowerCase();
719 }
720
721 phrase_query_p_term_x_variants.add(termValueU);
722 phrase_query_p_term_x_variants.add(termValueL);
723 }
724 else
725 {
726 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
727 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
728 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
729 for (int k = 0; k < term_equivalent_terms.length; k++) {
730 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
731 }
732 }
733 }
734 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
735
736 if (in_phrase == false) {
737 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
738 phrase_query_p_term_variants_list = new ArrayList();
739 }
740 }
741 }
742 // Watch for phrases (surrounded by quotes)
743 if (character == '\"') {
744 // Has a phrase just started?
745 if (in_phrase == false) {
746 in_phrase = true;
747 }
748 // Or has a phrase just finished?
749 else if (in_phrase == true) {
750 in_phrase = false;
751 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
752 }
753
754 phrase_query_p_term_variants_list = new ArrayList();
755 }
756 }
757
758 System.err.println(query_term_variants + " *** " + phrase_query_term_variants_hierarchy);
759 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
760 }
761
762
763 /**
764 * Highlights query terms in a piece of text.
765 */
766 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
767 {
768 // Convert the content string to an array of characters for speed
769 char[] content_characters = new char[content.length()];
770 content.getChars(0, content.length(), content_characters, 0);
771
772 // Now skim through the content, identifying word matches
773 ArrayList word_matches = new ArrayList();
774 int word_start = 0;
775 boolean in_word = false;
776 boolean preceding_word_matched = false;
777 for (int i = 0; i < content_characters.length; i++) {
778 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
779
780 // Has a word just started?
781 if (in_word == false && is_character_letter_or_digit == true) {
782 in_word = true;
783 word_start = i;
784 }
785
786 // Or has a word just finished?
787 else if (in_word == true && is_character_letter_or_digit == false) {
788 in_word = false;
789
790 // Check if the word matches any of the query term equivalents
791 String word = new String(content_characters, word_start, (i - word_start));
792 if (query_term_variants.contains(word)) {
793 // We have found a matching word, so remember its location
794 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
795 preceding_word_matched = true;
796 }
797 else {
798 preceding_word_matched = false;
799 }
800 }
801 }
802
803 // Don't forget the last word...
804 if (in_word == true) {
805 // Check if the word matches any of the query term equivalents
806 String word = new String(content_characters, word_start, (content_characters.length - word_start));
807 if (query_term_variants.contains(word)) {
808 // We have found a matching word, so remember its location
809 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
810 }
811 }
812
813 ArrayList highlight_start_positions = new ArrayList();
814 ArrayList highlight_end_positions = new ArrayList();
815
816 // Deal with phrases now
817 ArrayList partial_phrase_matches = new ArrayList();
818 for (int i = 0; i < word_matches.size(); i++) {
819 WordMatch word_match = (WordMatch) word_matches.get(i);
820
821 // See if any partial phrase matches are extended by this word
822 if (word_match.preceding_word_matched) {
823 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {
824 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
825 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
826 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
827 if (phrase_query_p_term_x_variants.contains(word_match.word)) {
828 partial_phrase_match.num_words_matched++;
829
830 // Has a complete phrase match occurred?
831 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {
832 // Check for overlaps by looking at the previous highlight range
833 if (!highlight_end_positions.isEmpty()) {
834 int last_highlight_index = highlight_end_positions.size() - 1;
835 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
836 if (last_highlight_end > partial_phrase_match.start_position) {
837 // There is an overlap, so remove the previous phrase match
838 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
839 highlight_end_positions.remove(last_highlight_index);
840 partial_phrase_match.start_position = last_highlight_start;
841 }
842 }
843
844 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
845 highlight_end_positions.add(new Integer(word_match.end_position));
846 }
847 // No, but add the partial match back into the list for next time
848 else {
849 partial_phrase_matches.add(partial_phrase_match);
850 }
851 }
852 }
853 }
854 else {
855 partial_phrase_matches.clear();
856 }
857
858 // See if this word is at the start of any of the phrases
859 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {
860 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
861 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
862 if (phrase_query_p_term_1_variants.contains(word_match.word)) {
863 // If this phrase is just one word long, we have a complete match
864 if (phrase_query_p_term_variants_list.size() == 1) {
865 highlight_start_positions.add(new Integer(word_match.start_position));
866 highlight_end_positions.add(new Integer(word_match.end_position));
867 }
868 // Otherwise we have the start of a potential phrase match
869 else {
870 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
871 }
872 }
873 }
874 }
875
876 // Now add the annotation tags into the document at the correct points
877 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
878
879 int last_wrote = 0;
880 for (int i = 0; i < highlight_start_positions.size(); i++) {
881 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
882 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
883
884 // Print anything before the highlight range
885 if (last_wrote < highlight_start) {
886 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
887 content_element.appendChild(this.doc.createTextNode(preceding_text));
888 }
889
890 // Print the highlight text, annotated
891 if (highlight_end > last_wrote) {
892 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
893 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
894 annotation_element.setAttribute("type", "query_term");
895 content_element.appendChild(annotation_element);
896 last_wrote = highlight_end;
897 }
898 }
899
900 // Finish off any unwritten text
901 if (last_wrote < content_characters.length) {
902 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
903 content_element.appendChild(this.doc.createTextNode(remaining_text));
904 }
905
906 return content_element;
907 }
908
909
910 static private class WordMatch
911 {
912 public String word;
913 public int start_position;
914 public int end_position;
915 public boolean preceding_word_matched;
916
917 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
918 {
919 this.word = word;
920 this.start_position = start_position;
921 this.end_position = end_position;
922 this.preceding_word_matched = preceding_word_matched;
923 }
924 }
925
926
927 static private class PartialPhraseMatch
928 {
929 public int start_position;
930 public int query_phrase_number;
931 public int num_words_matched;
932
933 public PartialPhraseMatch(int start_position, int query_phrase_number)
934 {
935 this.start_position = start_position;
936 this.query_phrase_number = query_phrase_number;
937 this.num_words_matched = 1;
938 }
939 }
940}
Note: See TracBrowser for help on using the repository browser.