source: trunk/gsdl3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 8731

Last change on this file since 8731 was 8731, checked in by mdewsnip, 19 years ago

Much more advanced query term highlighting. Supports query term highlighting and query phrase highlighting, with all permutations of case folding and stemming.

  • Property svn:keywords set to Author Date Id Revision
File size: 34.1 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38
39/** Action class for retrieving Documents via the message router
40 */
41public class DocumentAction extends Action {
42
43 // this is used to specify that the sibling nodes of a selected one should be obtained
44 public static final String SIBLING_ARG = "sib";
45 public static final String GOTO_PAGE_ARG = "gp"; // sibling (page) number; process() appends "." + value + ".ss" to the document id
46 public static final String ENRICH_DOC_ARG = "end"; // when "1" (and provide_annotations is true), process() sends the content to the selected enrich service
47
48 /** if this is set to true, when a document is displayed, any annotation
49 * type services (enrich) will be offered to the user as well */
50 protected static final boolean provide_annotations = false; //true;
51
52
53 public Element process (Element message)
54 {
55 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
56
57 // the response
58 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
59 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
60 result.appendChild(page_response);
61
62 // get the request - assume only one
63 Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
64 Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
65 HashMap params = GSXML.extractParams(cgi_paramList, false);
66
67 // just in case there are some that need to get passed to the services
68 HashMap service_params = (HashMap)params.get("s0");
69
70 String collection = (String) params.get(GSParams.COLLECTION);
71 String lang = request.getAttribute(GSXML.LANG_ATT);
72 String uid = request.getAttribute(GSXML.USER_ID_ATT);
73 String document_name = (String) params.get(GSParams.DOCUMENT);
74 if (document_name == null || document_name.equals("")) {
75 System.err.println("DocumentAction Error: no document specified!");
76 return result;
77 }
78 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
79 if (document_type == null) {
80 document_type = "simple";
81 }
82 //whether to retrieve siblings or not
83 boolean get_siblings = false;
84 String sibs = (String) params.get(SIBLING_ARG);
85 if (sibs != null && sibs.equals("1")) {
86 get_siblings = true;
87 }
88
89 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
90 if (sibling_num != null && !sibling_num.equals("")) {
91 // we have to modify the doc name
92 document_name = document_name+"."+sibling_num+".ss";
93 }
94
95 boolean expand_document = false;
96 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
97 if (ed_arg != null && ed_arg.equals("1")) {
98 expand_document = true;
99 }
100
101
102 boolean expand_contents = false;
103 if (expand_document) { // we always expand the contents with the text
104 expand_contents = true;
105 } else {
106 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
107 if (ec_arg != null && ec_arg.equals("1")) {
108 expand_contents = true;
109 }
110 }
111 // get the additional data needed for the page
112 getBackgroundData(page_response, collection, lang, uid);
113 Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
114
115 // the_document is where all the doc info - structure and metadata etc
116 // is added into, to be returned in the page
117 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
118 page_response.appendChild(the_document);
119
120 // set the doctype from the cgi arg as an attribute
121 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
122
123 // create a basic doc list containing the current node
124 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
125 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
126 basic_doc_list.appendChild(current_doc);
127 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
128
129 // Create a parameter list to specify the required structure information
130 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
131
132 if (service_params != null) {
133 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
134 }
135
136 Element ds_param = null;
137 boolean get_structure = false;
138 boolean get_structure_info = false;
139 if (document_type.equals("paged")) {
140 get_structure_info = true;
141 // get teh info needed for paged naviagtion
142 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
143 ds_param_list.appendChild(ds_param);
144 ds_param.setAttribute(GSXML.NAME_ATT, "info");
145 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
146 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
147 ds_param_list.appendChild(ds_param);
148 ds_param.setAttribute(GSXML.NAME_ATT, "info");
149 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
150 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
151 ds_param_list.appendChild(ds_param);
152 ds_param.setAttribute(GSXML.NAME_ATT, "info");
153 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
154
155 } else if (document_type.equals("hierarchy")){
156 get_structure = true;
157 if (expand_contents) {
158 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
159 ds_param_list.appendChild(ds_param);
160 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
161 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
162 } else {
163 // get the info needed for table of contents
164 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
165 ds_param_list.appendChild(ds_param);
166 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
167 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
168 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
169 ds_param_list.appendChild(ds_param);
170 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
171 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
172 if (get_siblings) {
173 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
174 ds_param_list.appendChild(ds_param);
175 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
176 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
177 }
178 }
179 } else {
180 // we dont need any structure
181 }
182
183 boolean has_dummy = false;
184 if (get_structure || get_structure_info) {
185
186 // Build a request to obtain the document structure
187 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
188 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
189 Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
190 ds_message.appendChild(ds_request);
191 ds_request.appendChild(ds_param_list);
192
193 // create a doc_node_list and put in the doc_node that we are interested in
194 ds_request.appendChild(basic_doc_list);
195
196 // Process the document structure retrieve message
197 Element ds_response_message = (Element) this.mr.process(ds_message);
198
199 // get the info and print out
200 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
201 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
202 path = GSPath.appendLink(path, "nodeStructureInfo");
203 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
204 // get the doc_node bit
205 if (ds_response_struct_info != null) {
206 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
207 }
208 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
209 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
210 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
211 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
212
213 if (ds_response_structure != null) {
214 // add the contents of the structure bit into the_document
215 NodeList structs = ds_response_structure.getChildNodes();
216 for (int i=0; i<structs.getLength();i++) {
217 the_document.appendChild(this.doc.importNode(structs.item(i), true));
218 }
219 } else {
220 // no structure nodes, so put in a dummy doc node
221 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
222 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
223 the_document.appendChild(doc_node);
224 has_dummy = true;
225 }
226 } else { // a simple type - we dont have a dummy node for simple
227 // should think about this more
228 // no structure request, so just put in a dummy doc node
229 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
230 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
231 the_document.appendChild(doc_node);
232 has_dummy = true;
233 }
234
235 // Build a request to obtain some document metadata
236 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
237 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
238 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
239 dm_message.appendChild(dm_request);
240 // Create a parameter list to specify the required metadata information
241
242 HashSet meta_names = new HashSet();
243 meta_names.add("Title"); // the default
244 if (format_elem != null) {
245 extractMetadataNames(format_elem, meta_names);
246 }
247
248 Element dm_param_list = createMetadataParamList(meta_names);
249 if (service_params != null) {
250 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
251 }
252
253 dm_request.appendChild(dm_param_list);
254
255
256 // create the doc node list for the metadata request
257 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
258 dm_request.appendChild(dm_doc_list);
259
260 // Add each node from the structure response into the metadata request
261 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
262 for (int i = 0; i < doc_nodes.getLength(); i++) {
263 Element doc_node = (Element) doc_nodes.item(i);
264 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
265
266 // Add the documentNode to the list
267 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
268 dm_doc_list.appendChild(dm_doc_node);
269 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
270 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT,
271 doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
272 }
273
274 // we also want a metadata request to the top level document to get
275 // assocfilepath - this could be cached too
276 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
277 dm_message.appendChild(doc_meta_request);
278 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
279 if (service_params != null) {
280 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
281 }
282
283 doc_meta_request.appendChild(doc_meta_param_list);
284 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
285 doc_meta_param_list.appendChild(doc_param);
286 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
287 doc_param.setAttribute(GSXML.VALUE_ATT, "archivedir");
288
289 // create the doc node list for the metadata request
290 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
291 doc_meta_request.appendChild(doc_list);
292
293 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
294 // teh node we want is the root document node
295 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt");
296 doc_list.appendChild(doc_node);
297 Element dm_response_message = (Element) this.mr.process(dm_message);
298
299 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
300 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
301
302 // Merge the metadata with the structure information
303 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
304 for (int i = 0; i < doc_nodes.getLength(); i++) {
305 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
306 }
307 // get teh top level doc metadata out
308 Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
309 Element doc_meta_list = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode/metadataList");
310 if (doc_meta_list != null) {
311 the_document.appendChild(this.doc.importNode(doc_meta_list, true));
312 }
313 // Build a request to obtain some document content
314 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
315 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
316 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
317 dc_message.appendChild(dc_request);
318
319
320 // Create a parameter list to specify the request parameters - empty for now
321 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
322 if (service_params != null) {
323 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
324 }
325
326 dc_request.appendChild(dc_param_list);
327
328 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
329 if (expand_document) {
330 dc_request.appendChild(dm_doc_list);
331 } else {
332 dc_request.appendChild(basic_doc_list);
333 }
334 Element dc_response_message = (Element) this.mr.process(dc_message);
335 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
336
337 if (expand_document) {
338 // Merge the content with the structure information
339 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
340 for (int i = 0; i < doc_nodes.getLength(); i++) {
341 doc_nodes.item(i).appendChild(this.doc.importNode(GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent"), true));
342 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
343 }
344 } else {
345
346 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
347 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
348 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
349
350
351 boolean highlight_query_terms = true;
352 if (highlight_query_terms) {
353 dc_response_doc.removeChild(dc_response_doc_content);
354
355 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
356 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
357 }
358
359
360 if (provide_annotations) {
361 String service_selected = (String)params.get(ENRICH_DOC_ARG);
362 if (service_selected != null && service_selected.equals("1")) {
363 // now we can modifiy the response doc if needed
364 String enrich_service = (String)params.get(GSParams.SERVICE);
365 // send a message to the service
366 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
367 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
368 enrich_message.appendChild(enrich_request);
369 // check for parameters
370 HashMap e_service_params = (HashMap)params.get("s1");
371 if (e_service_params != null) {
372 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
373 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
374 enrich_request.appendChild(enrich_pl);
375 }
376 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);
377 enrich_request.appendChild(e_doc_list);
378 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
379
380 Element enrich_response = this.mr.process(enrich_message);
381
382 String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM};
383 path = GSPath.createPath(links);
384 dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path);
385
386 }
387 }
388
389
390 // use the returned id rather than the sent one cos there may have
391 // been modifiers such as .pr that are removed.
392 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
393 the_document.setAttribute("selectedNode", modified_doc_id);
394 if (has_dummy) {
395 // change the id if necessary and add the content
396 Element dummy_node = (Element)doc_nodes.item(0);
397
398 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
399 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
400 // hack for simple type
401 if (document_type.equals("simple")) {
402 // we dont want the internal docNode, just want the content and metadata in the document
403 // rethink this!!
404 the_document.removeChild(dummy_node);
405
406 NodeList dummy_children = dummy_node.getChildNodes();
407 //for (int i=0; i<dummy_children.getLength(); i++) {
408 for (int i=dummy_children.getLength()-1; i>=0; i--) {
409 the_document.appendChild(dummy_children.item(i));
410
411 }
412 }
413 } else {
414 // Merge the document content with the metadata and structure information
415 for (int i = 0; i < doc_nodes.getLength(); i++) {
416 Node dn = doc_nodes.item(i);
417 String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT);
418 if (dn_id.equals(modified_doc_id)) {
419 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
420 break;
421 }
422 }
423 }
424 }
425 ///ystem.out.println("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
426 return result;
427 }
428
429 /** tell the param class what its arguments are
430 * if an action has its own arguments, this should add them to the params
431 * object - particularly important for args that should not be saved */
432 public boolean getActionParameters(GSParams params) {
433 params.addParameter(GOTO_PAGE_ARG, false);
434 params.addParameter(ENRICH_DOC_ARG, false);
435 return true;
436 }
437
438
439 /** this method gets the collection description, the format info, the
440 * list of enrich services, etc - stuff that is needed for the page,
441 * but is the same whatever the query is - should be cached */
442 protected boolean getBackgroundData(Element page_response,
443 String collection, String lang,
444 String uid) {
445
446 // create a message to process - contains requests for the collection
447 // description, the format element, the enrich services on offer
448 // these could all be cached
449 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
450 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
451 // the format request - ignore for now, where does this request go to??
452 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
453 info_message.appendChild(format_request);
454
455 // the enrich_services request - only do this if provide_annotations is true
456
457 if (provide_annotations) {
458 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
459 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
460 info_message.appendChild(enrich_services_request);
461 }
462
463 Element info_response = (Element)this.mr.process(info_message);
464
465 // the collection is the first response
466 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
467 Element format_resp = (Element) responses.item(0);
468
469 Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
470 if (format_elem != null) {
471 ///ystem.out.println("doc action found a format statement");
472 // set teh format type
473 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
474 page_response.appendChild(this.doc.importNode(format_elem, true));
475 }
476
477 if (provide_annotations) {
478 Element services_resp = (Element)responses.item(1);
479
480 // a new message for the mr
481 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
482
483 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
484 boolean service_found = false;
485 for (int j=0; j<e_services.getLength(); j++) {
486 if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) {
487 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
488 enrich_message.appendChild(s);
489 service_found = true;
490 }
491 }
492 if (service_found) {
493 Element enrich_response = this.mr.process(enrich_message);
494
495 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
496 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
497 for (int i=0; i<e_responses.getLength(); i++) {
498 Element e_resp = (Element)e_responses.item(i);
499 Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
500 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
501 service_list.appendChild(e_service);
502 }
503 page_response.appendChild(service_list);
504 }
505 } // if provide_annotations
506 return true;
507
508 }
509
510 /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. also doesn't handle phrases properly - just highlights all the terms found in the text.
511 */
512 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) {
513
514 // do the query again to get term info
515 Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);
516 HashMap params = GSXML.extractParams(cgi_param_list, false);
517
518
519 String service_name = (String)((HashMap)params.get("p")).get(GSParams.SERVICE);
520 if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
521 System.err.println("DocumentAction: invalid service, not doing highlighting");
522 return dc_response_doc_content;
523 }
524 String collection = (String)params.get(GSParams.COLLECTION);
525 String lang = request.getAttribute(GSXML.LANG_ATT);
526 String uid = request.getAttribute(GSXML.USER_ID_ATT);
527 String to = GSPath.appendLink(collection, service_name);
528
529 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
530 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
531 mr_query_message.appendChild(mr_query_request);
532
533 // paramList
534 HashMap service_params = (HashMap)params.get("s1");
535
536 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
537 GSXML.addParametersToList(this.doc, query_param_list, service_params);
538 mr_query_request.appendChild(query_param_list);
539
540 // do the query
541 Element mr_query_response = (Element)this.mr.process(mr_query_message);
542
543 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);
544 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
545 if (query_term_list_element == null) {
546 // no term info
547 System.err.println("DocumentAction: Warning: No query term information.\n");
548 return dc_response_doc_content;
549 }
550
551 String content = GSXML.getNodeText(dc_response_doc_content);
552
553 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
554 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
555
556 HashSet query_term_variants = new HashSet();
557 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
558 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
559 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
560 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
561 for (int j = 0; j < equivalent_terms.length; j++) {
562 System.err.println("Adding query term variant: " + equivalent_terms[j]);
563 query_term_variants.add(equivalent_terms[j]);
564 }
565 }
566
567 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
568
569 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
570 String performed_query = GSXML.getNodeText(query_element) + " ";
571
572 ArrayList phrase_query_p_term_variants_list = new ArrayList();
573 int term_start = 0;
574 boolean in_term = false;
575 boolean in_phrase = false;
576 for (int i = 0; i < performed_query.length(); i++) {
577 char character = performed_query.charAt(i);
578 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
579
580 // Has a query term just started?
581 if (in_term == false && is_character_letter_or_digit == true) {
582 in_term = true;
583 term_start = i;
584 }
585
586 // Or has a term just finished?
587 else if (in_term == true && is_character_letter_or_digit == false) {
588 in_term = false;
589 String term = performed_query.substring(term_start, i);
590 System.err.println("Term: " + term);
591
592 HashSet phrase_query_p_term_x_variants = new HashSet();
593 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
594 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
595 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
596 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
597 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
598 for (int k = 0; k < term_equivalent_terms.length; k++) {
599 System.err.println("Adding query term variant: " + term_equivalent_terms[k]);
600 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
601 }
602 }
603 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
604
605 if (in_phrase == false) {
606 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
607 phrase_query_p_term_variants_list = new ArrayList();
608 }
609 }
610
611 // Watch for phrases (surrounded by quotes)
612 if (character == '\"') {
613 // Has a phrase just started?
614 if (in_phrase == false) {
615 in_phrase = true;
616 }
617 // Or has a phrase just finished?
618 else if (in_phrase == true) {
619 in_phrase = false;
620 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
621 }
622
623 phrase_query_p_term_variants_list = new ArrayList();
624 }
625 }
626
627 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
628 }
629
630
631 /**
632 * Highlights query terms in a piece of text.
633 */
634 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
635 {
636 // Convert the content string to an array of characters for speed
637 char[] content_characters = new char[content.length()];
638 content.getChars(0, content.length(), content_characters, 0);
639
640 // Now skim through the content, identifying word matches
641 ArrayList word_matches = new ArrayList();
642 int word_start = 0;
643 boolean in_word = false;
644 boolean preceding_word_matched = false;
645 for (int i = 0; i < content_characters.length; i++) {
646 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
647
648 // Has a word just started?
649 if (in_word == false && is_character_letter_or_digit == true) {
650 in_word = true;
651 word_start = i;
652 }
653
654 // Or has a word just finished?
655 else if (in_word == true && is_character_letter_or_digit == false) {
656 in_word = false;
657
658 // Check if the word matches any of the query term equivalents
659 String word = new String(content_characters, word_start, (i - word_start));
660 if (query_term_variants.contains(word)) {
661 // We have found a matching word, so remember its location
662 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
663 preceding_word_matched = true;
664 }
665 else {
666 preceding_word_matched = false;
667 }
668 }
669 }
670
671 // Don't forget the last word...
672 if (in_word == true) {
673 // Check if the word matches any of the query term equivalents
674 String word = new String(content_characters, word_start, (content_characters.length - word_start));
675 if (query_term_variants.contains(word)) {
676 // We have found a matching word, so remember its location
677 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
678 }
679 }
680
681 ArrayList highlight_start_positions = new ArrayList();
682 ArrayList highlight_end_positions = new ArrayList();
683
684 // Deal with phrases now
685 ArrayList partial_phrase_matches = new ArrayList();
686 for (int i = 0; i < word_matches.size(); i++) {
687 WordMatch word_match = (WordMatch) word_matches.get(i);
688
689 // See if any partial phrase matches are extended by this word
690 if (word_match.preceding_word_matched) {
691 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {
692 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
693 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
694 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
695 if (phrase_query_p_term_x_variants.contains(word_match.word)) {
696 partial_phrase_match.num_words_matched++;
697
698 // Has a complete phrase match occurred?
699 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {
700 // Check for overlaps by looking at the previous highlight range
701 if (!highlight_end_positions.isEmpty()) {
702 int last_highlight_index = highlight_end_positions.size() - 1;
703 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
704 if (last_highlight_end > partial_phrase_match.start_position) {
705 // There is an overlap, so remove the previous phrase match
706 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
707 highlight_end_positions.remove(last_highlight_index);
708 partial_phrase_match.start_position = last_highlight_start;
709 }
710 }
711
712 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
713 highlight_end_positions.add(new Integer(word_match.end_position));
714 }
715 // No, but add the partial match back into the list for next time
716 else {
717 partial_phrase_matches.add(partial_phrase_match);
718 }
719 }
720 }
721 }
722 else {
723 partial_phrase_matches.clear();
724 }
725
726 // See if this word is at the start of any of the phrases
727 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {
728 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
729 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
730 if (phrase_query_p_term_1_variants.contains(word_match.word)) {
731 // If this phrase is just one word long, we have a complete match
732 if (phrase_query_p_term_variants_list.size() == 1) {
733 highlight_start_positions.add(new Integer(word_match.start_position));
734 highlight_end_positions.add(new Integer(word_match.end_position));
735 }
736 // Otherwise we have the start of a potential phrase match
737 else {
738 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
739 }
740 }
741 }
742 }
743
744 // Now add the annotation tags into the document at the correct points
745 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
746
747 int last_wrote = 0;
748 for (int i = 0; i < highlight_start_positions.size(); i++) {
749 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
750 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
751
752 // Print anything before the highlight range
753 if (last_wrote < highlight_start) {
754 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
755 // System.err.print(preceding_text);
756 content_element.appendChild(this.doc.createTextNode(preceding_text));
757 }
758
759 // Print the highlight text, annotated
760 if (highlight_end > last_wrote) {
761 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
762 // System.err.print("|" + highlight_text + "|");
763 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
764 annotation_element.setAttribute("type", "query_term");
765 content_element.appendChild(annotation_element);
766 last_wrote = highlight_end;
767 }
768 }
769
770 // Finish off any unwritten text
771 if (last_wrote < content_characters.length) {
772 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
773 // System.err.print(remaining_text);
774 content_element.appendChild(this.doc.createTextNode(remaining_text));
775 }
776
777 return content_element;
778 }
779
780
781 static private class WordMatch
782 {
783 public String word;
784 public int start_position;
785 public int end_position;
786 public boolean preceding_word_matched;
787
788 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
789 {
790 this.word = word;
791 this.start_position = start_position;
792 this.end_position = end_position;
793 this.preceding_word_matched = preceding_word_matched;
794 }
795 }
796
797
798 static private class PartialPhraseMatch
799 {
800 public int start_position;
801 public int query_phrase_number;
802 public int num_words_matched;
803
804 public PartialPhraseMatch(int start_position, int query_phrase_number)
805 {
806 this.start_position = start_position;
807 this.query_phrase_number = query_phrase_number;
808 this.num_words_matched = 1;
809 }
810 }
811}
Note: See TracBrowser for help on using the repository browser.