source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 24889

Last change on this file since 24889 was 24889, checked in by sjm84, 12 years ago

Adjusted how metadata names (to return) are acquired

  • Property svn:keywords set to Author Date Id Revision
File size: 39.5 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38import org.apache.log4j.*;
39
40/** Action class for retrieving Documents via the message router */
41public class DocumentAction extends Action
42{
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46 // this is used to specify that the sibling nodes of a selected one should be obtained
47 public static final String SIBLING_ARG = "sib";
48 public static final String GOTO_PAGE_ARG = "gp";
49 public static final String ENRICH_DOC_ARG = "end";
50
51 /**
52 * if this is set to true, when a document is displayed, any annotation type
53 * services (enrich) will be offered to the user as well
54 */
55 protected boolean provide_annotations = false;
56
57 protected boolean highlight_query_terms = false;
58
59 public boolean configure()
60 {
61 super.configure();
62 String highlight = (String) config_params.get("highlightQueryTerms");
63 if (highlight != null && highlight.equals("true"))
64 {
65 highlight_query_terms = true;
66 }
67 String annotate = (String) config_params.get("displayAnnotationService");
68 if (annotate != null && annotate.equals("true"))
69 {
70 provide_annotations = true;
71 }
72 return true;
73 }
74
75 public Node process(Node message_node)
76 {
77 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
78
79 Element message = this.converter.nodeToElement(message_node);
80
81 // the response
82 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
83 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
84 result.appendChild(page_response);
85
86 // get the request - assume only one
87 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
88 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
89 HashMap params = GSXML.extractParams(cgi_paramList, false);
90
91 // just in case there are some that need to get passed to the services
92 HashMap service_params = (HashMap) params.get("s0");
93
94 String has_rl = null;
95 String has_href = null;
96 has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list
97 has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list
98 String collection = (String) params.get(GSParams.COLLECTION);
99 String lang = request.getAttribute(GSXML.LANG_ATT);
100 String uid = request.getAttribute(GSXML.USER_ID_ATT);
101 String document_name = (String) params.get(GSParams.DOCUMENT);
102 if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals("")))
103 {
104 logger.error("no document specified!");
105 return result;
106 }
107 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
108 if (document_type == null)
109 {
110 document_type = "simple";
111 }
112 //whether to retrieve siblings or not
113 boolean get_siblings = false;
114 String sibs = (String) params.get(SIBLING_ARG);
115 if (sibs != null && sibs.equals("1"))
116 {
117 get_siblings = true;
118 }
119
120 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
121 if (sibling_num != null && !sibling_num.equals(""))
122 {
123 // we have to modify the doc name
124 document_name = document_name + "." + sibling_num + ".ss";
125 }
126
127 boolean expand_document = false;
128 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT);
129 if (ed_arg != null && ed_arg.equals("1"))
130 {
131 expand_document = true;
132 }
133
134 boolean expand_contents = false;
135 if (expand_document)
136 { // we always expand the contents with the text
137 expand_contents = true;
138 }
139 else
140 {
141 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS);
142 if (ec_arg != null && ec_arg.equals("1"))
143 {
144 expand_contents = true;
145 }
146 }
147
148 //append site metadata
149 addSiteMetadata(page_response, lang, uid);
150
151 // get the additional data needed for the page
152 getBackgroundData(page_response, collection, lang, uid);
153 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
154
155 // the_document is where all the doc info - structure and metadata etc
156 // is added into, to be returned in the page
157 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
158 page_response.appendChild(the_document);
159
160 // set the doctype from the cgi arg as an attribute
161 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
162
163 // create a basic doc list containing the current node
164 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
165 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
166 basic_doc_list.appendChild(current_doc);
167 if (document_name.length() != 0)
168 {
169 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name);
170 }
171 else if (has_href.length() != 0)
172 {
173 current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href);
174 current_doc.setAttribute("externalURL", has_rl);
175 }
176
177 // Create a parameter list to specify the required structure information
178 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
179
180 if (service_params != null)
181 {
182 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
183 }
184
185 Element ds_param = null;
186 boolean get_structure = false;
187 boolean get_structure_info = false;
188 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
189 {
190 get_structure_info = true;
191
192 if (expand_contents)
193 {
194 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
195 ds_param_list.appendChild(ds_param);
196 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
197 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
198 }
199
200 // get teh info needed for paged naviagtion
201 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
202 ds_param_list.appendChild(ds_param);
203 ds_param.setAttribute(GSXML.NAME_ATT, "info");
204 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
205 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
206 ds_param_list.appendChild(ds_param);
207 ds_param.setAttribute(GSXML.NAME_ATT, "info");
208 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
209 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
210 ds_param_list.appendChild(ds_param);
211 ds_param.setAttribute(GSXML.NAME_ATT, "info");
212 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
213
214 if (get_siblings)
215 {
216 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
217 ds_param_list.appendChild(ds_param);
218 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
219 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
220 }
221
222 }
223 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY))
224 {
225 get_structure = true;
226 if (expand_contents)
227 {
228 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
229 ds_param_list.appendChild(ds_param);
230 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
231 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
232 }
233 else
234 {
235 // get the info needed for table of contents
236 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
237 ds_param_list.appendChild(ds_param);
238 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
239 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
240 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
241 ds_param_list.appendChild(ds_param);
242 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
243 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
244 if (get_siblings)
245 {
246 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
247 ds_param_list.appendChild(ds_param);
248 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
249 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
250 }
251 }
252 }
253 else
254 {
255 // we dont need any structure
256 }
257
258 boolean has_dummy = false;
259 if (get_structure || get_structure_info)
260 {
261
262 // Build a request to obtain the document structure
263 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
264 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
265 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
266 ds_message.appendChild(ds_request);
267 ds_request.appendChild(ds_param_list);
268
269 // create a doc_node_list and put in the doc_node that we are interested in
270 ds_request.appendChild(basic_doc_list);
271
272 // Process the document structure retrieve message
273 Element ds_response_message = (Element) this.mr.process(ds_message);
274 if (processErrorElements(ds_response_message, page_response))
275 {
276 return result;
277 }
278
279 // get the info and print out
280 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
281 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
282 path = GSPath.appendLink(path, "nodeStructureInfo");
283 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
284 // get the doc_node bit
285 if (ds_response_struct_info != null)
286 {
287 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
288 }
289 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
290 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
291 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
292 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
293
294 if (ds_response_structure != null)
295 {
296 // add the contents of the structure bit into the_document
297 NodeList structs = ds_response_structure.getChildNodes();
298 for (int i = 0; i < structs.getLength(); i++)
299 {
300 the_document.appendChild(this.doc.importNode(structs.item(i), true));
301 }
302 }
303 else
304 {
305 // no structure nodes, so put in a dummy doc node
306 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
307 if (document_name.length() != 0)
308 {
309 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
310 }
311 else if (has_href.length() != 0)
312 {
313 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
314 doc_node.setAttribute("externalURL", has_rl);
315 }
316 the_document.appendChild(doc_node);
317 has_dummy = true;
318 }
319 }
320 else
321 { // a simple type - we dont have a dummy node for simple
322 // should think about this more
323 // no structure request, so just put in a dummy doc node
324 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
325 if (document_name.length() != 0)
326 {
327 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);
328 }
329 else if (has_href.length() != 0)
330 {
331 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);
332 doc_node.setAttribute("externalURL", has_rl);
333 }
334 the_document.appendChild(doc_node);
335 has_dummy = true;
336 }
337
338 // Build a request to obtain some document metadata
339 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
340 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
341 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
342 dm_message.appendChild(dm_request);
343 // Create a parameter list to specify the required metadata information
344
345 HashSet meta_names = new HashSet();
346 meta_names.add("Title"); // the default
347 if (format_elem != null)
348 {
349 getRequiredMetadataNames(format_elem, meta_names);
350 }
351
352 Element dm_param_list = createMetadataParamList(meta_names);
353 if (service_params != null)
354 {
355 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
356 }
357
358 dm_request.appendChild(dm_param_list);
359
360 // create the doc node list for the metadata request
361 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
362 dm_request.appendChild(dm_doc_list);
363
364 // Add each node from the structure response into the metadata request
365 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
366 for (int i = 0; i < doc_nodes.getLength(); i++)
367 {
368 Element doc_node = (Element) doc_nodes.item(i);
369 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
370
371 // Add the documentNode to the list
372 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
373 dm_doc_list.appendChild(dm_doc_node);
374 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
375 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
376 }
377
378 // we also want a metadata request to the top level document to get
379 // assocfilepath - this could be cached too
380 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
381 dm_message.appendChild(doc_meta_request);
382 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
383 if (service_params != null)
384 {
385 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
386 }
387
388 doc_meta_request.appendChild(doc_meta_param_list);
389 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
390 doc_meta_param_list.appendChild(doc_param);
391 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
392 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
393
394 // create the doc node list for the metadata request
395 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
396 doc_meta_request.appendChild(doc_list);
397
398 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
399 // the node we want is the root document node
400 if (document_name.length() != 0)
401 {
402 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name + ".rt");
403 }
404 else if (has_href.length() != 0)
405 {
406 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href + ".rt");
407 doc_node.setAttribute("externalURL", has_rl);
408 }
409 doc_list.appendChild(doc_node);
410
411 Element dm_response_message = (Element) this.mr.process(dm_message);
412 if (processErrorElements(dm_response_message, page_response))
413 {
414 return result;
415 }
416
417 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
418 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
419
420 // Merge the metadata with the structure information
421 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
422 for (int i = 0; i < doc_nodes.getLength(); i++)
423 {
424 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
425 }
426 // get the top level doc metadata out
427 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
428 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
429 GSXML.mergeMetadataLists(the_document, top_doc_node);
430
431 // Build a request to obtain some document content
432 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
433 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
434 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
435 dc_message.appendChild(dc_request);
436
437 // Create a parameter list to specify the request parameters - empty for now
438 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
439 if (service_params != null)
440 {
441 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
442 }
443
444 dc_request.appendChild(dc_param_list);
445
446 // get the content
447 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
448 if (expand_document)
449 {
450 dc_request.appendChild(dm_doc_list);
451 }
452 else
453 {
454 dc_request.appendChild(basic_doc_list);
455 }
456 logger.debug("request = " + converter.getString(dc_message));
457 Element dc_response_message = (Element) this.mr.process(dc_message);
458 if (processErrorElements(dc_response_message, page_response))
459 {
460 return result;
461 }
462
463 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
464
465 if (expand_document)
466 {
467 // Merge the content with the structure information
468 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
469 for (int i = 0; i < doc_nodes.getLength(); i++)
470 {
471 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
472 if (content != null)
473 {
474 if (highlight_query_terms)
475 {
476 content = highlightQueryTerms(request, (Element) content);
477 }
478 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
479 }
480 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
481 }
482 }
483 else
484 {
485 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
486 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
487 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
488 Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
489
490 if (dc_response_doc_content == null)
491 {
492 // no content to add
493 if (dc_response_doc_external != null)
494 {
495 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
496
497 the_document.setAttribute("selectedNode", modified_doc_id);
498 the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));
499 }
500 return result;
501 }
502 if (highlight_query_terms)
503 {
504 dc_response_doc.removeChild(dc_response_doc_content);
505
506 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
507 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
508 }
509
510 if (provide_annotations)
511 {
512 String service_selected = (String) params.get(ENRICH_DOC_ARG);
513 if (service_selected != null && service_selected.equals("1"))
514 {
515 // now we can modifiy the response doc if needed
516 String enrich_service = (String) params.get(GSParams.SERVICE);
517 // send a message to the service
518 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
519 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);
520 enrich_message.appendChild(enrich_request);
521 // check for parameters
522 HashMap e_service_params = (HashMap) params.get("s1");
523 if (e_service_params != null)
524 {
525 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
526 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
527 enrich_request.appendChild(enrich_pl);
528 }
529 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
530 enrich_request.appendChild(e_doc_list);
531 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
532
533 Node enrich_response = this.mr.process(enrich_message);
534
535 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
536 path = GSPath.createPath(links);
537 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
538
539 }
540 } // if provide_annotations
541
542 // use the returned id rather than the sent one cos there may have
543 // been modifiers such as .pr that are removed.
544 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
545 the_document.setAttribute("selectedNode", modified_doc_id);
546 if (has_dummy)
547 {
548 // change the id if necessary and add the content
549 Element dummy_node = (Element) doc_nodes.item(0);
550
551 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
552 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
553 // hack for simple type
554 if (document_type.equals("simple"))
555 {
556 // we dont want the internal docNode, just want the content and metadata in the document
557 // rethink this!!
558 the_document.removeChild(dummy_node);
559
560 NodeList dummy_children = dummy_node.getChildNodes();
561 //for (int i=0; i<dummy_children.getLength(); i++) {
562 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
563 {
564 // special case as we don't want more than one metadata list
565 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
566 {
567 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
568 }
569 else
570 {
571 the_document.appendChild(dummy_children.item(i));
572 }
573 }
574 }
575 }
576 else
577 {
578 // Merge the document content with the metadata and structure information
579 for (int i = 0; i < doc_nodes.getLength(); i++)
580 {
581 Node dn = doc_nodes.item(i);
582 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
583 if (dn_id.equals(modified_doc_id))
584 {
585 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
586 break;
587 }
588 }
589 }
590 }
591 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
592 return result;
593 }
594
595 /**
596 * tell the param class what its arguments are if an action has its own
597 * arguments, this should add them to the params object - particularly
598 * important for args that should not be saved
599 */
600 public boolean getActionParameters(GSParams params)
601 {
602 params.addParameter(GOTO_PAGE_ARG, false);
603 params.addParameter(ENRICH_DOC_ARG, false);
604 return true;
605 }
606
607 /**
608 * this method gets the collection description, the format info, the list of
609 * enrich services, etc - stuff that is needed for the page, but is the same
610 * whatever the query is - should be cached
611 */
612 protected boolean getBackgroundData(Element page_response, String collection, String lang, String uid)
613 {
614
615 // create a message to process - contains requests for the collection
616 // description, the format element, the enrich services on offer
617 // these could all be cached
618 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
619 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
620 // the format request - ignore for now, where does this request go to??
621 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);
622 info_message.appendChild(format_request);
623
624 // the enrich_services request - only do this if provide_annotations is true
625
626 if (provide_annotations)
627 {
628 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);
629 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
630 info_message.appendChild(enrich_services_request);
631 }
632
633 Element info_response = (Element) this.mr.process(info_message);
634
635 // the collection is the first response
636 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
637 Element format_resp = (Element) responses.item(0);
638
639 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
640 if (format_elem != null)
641 {
642 logger.debug("doc action found a format statement");
643 // set teh format type
644 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
645 page_response.appendChild(this.doc.importNode(format_elem, true));
646 }
647
648 if (provide_annotations)
649 {
650 Element services_resp = (Element) responses.item(1);
651
652 // a new message for the mr
653 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
654
655 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
656 boolean service_found = false;
657 for (int j = 0; j < e_services.getLength(); j++)
658 {
659 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
660 {
661 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);
662 enrich_message.appendChild(s);
663 service_found = true;
664 }
665 }
666 if (service_found)
667 {
668 Element enrich_response = (Element) this.mr.process(enrich_message);
669
670 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
671 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
672 for (int i = 0; i < e_responses.getLength(); i++)
673 {
674 Element e_resp = (Element) e_responses.item(i);
675 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
676 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
677 service_list.appendChild(e_service);
678 }
679 page_response.appendChild(service_list);
680 }
681 } // if provide_annotations
682 return true;
683
684 }
685
686 /**
687 * this involves a bit of a hack to get the equivalent query terms - has to
688 * requery the query service - uses the last selected service name. (if it
689 * ends in query). should this action do the query or should it send a
690 * message to the query action? but that will involve lots of extra stuff.
691 * also doesn't handle phrases properly - just highlights all the terms
692 * found in the text.
693 */
694 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
695 {
696
697 // do the query again to get term info
698 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
699 HashMap params = GSXML.extractParams(cgi_param_list, false);
700
701 HashMap previous_params = (HashMap) params.get("p");
702 if (previous_params == null)
703 {
704 return dc_response_doc_content;
705 }
706 String service_name = (String) previous_params.get(GSParams.SERVICE);
707 if (service_name == null || !service_name.endsWith("Query"))
708 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
709 logger.debug("invalid service, not doing highlighting");
710 return dc_response_doc_content;
711 }
712 String collection = (String) params.get(GSParams.COLLECTION);
713 String lang = request.getAttribute(GSXML.LANG_ATT);
714 String uid = request.getAttribute(GSXML.USER_ID_ATT);
715 String to = GSPath.appendLink(collection, service_name);
716
717 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
718 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);
719 mr_query_message.appendChild(mr_query_request);
720
721 // paramList
722 HashMap service_params = (HashMap) params.get("s1");
723
724 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
725 GSXML.addParametersToList(this.doc, query_param_list, service_params);
726 mr_query_request.appendChild(query_param_list);
727
728 // do the query
729 Element mr_query_response = (Element) this.mr.process(mr_query_message);
730
731 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
732 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
733 if (query_term_list_element == null)
734 {
735 // no term info
736 logger.error("No query term information.\n");
737 return dc_response_doc_content;
738 }
739
740 String content = GSXML.getNodeText(dc_response_doc_content);
741
742 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
743 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
744
745 HashSet query_term_variants = new HashSet();
746 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
747 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
748 {
749 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
750 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
751 {
752 for (int i = 0; i < terms_nodelist.getLength(); i++)
753 {
754 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
755 String termValueU = null;
756 String termValueL = null;
757
758 if (termValue.length() > 1)
759 {
760 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
761 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
762 }
763 else
764 {
765 termValueU = termValue.substring(0, 1).toUpperCase();
766 termValueL = termValue.substring(0, 1).toLowerCase();
767 }
768
769 query_term_variants.add(termValueU);
770 query_term_variants.add(termValueL);
771 }
772 }
773 }
774 else
775 {
776 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
777 {
778 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
779 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
780 for (int j = 0; j < equivalent_terms.length; j++)
781 {
782 query_term_variants.add(equivalent_terms[j]);
783 }
784 }
785 }
786
787 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
788
789 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
790 String performed_query = GSXML.getNodeText(query_element) + " ";
791
792 ArrayList phrase_query_p_term_variants_list = new ArrayList();
793 int term_start = 0;
794 boolean in_term = false;
795 boolean in_phrase = false;
796 for (int i = 0; i < performed_query.length(); i++)
797 {
798 char character = performed_query.charAt(i);
799 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
800
801 // Has a query term just started?
802 if (in_term == false && is_character_letter_or_digit == true)
803 {
804 in_term = true;
805 term_start = i;
806 }
807
808 // Or has a term just finished?
809 else if (in_term == true && is_character_letter_or_digit == false)
810 {
811 in_term = false;
812 String term = performed_query.substring(term_start, i);
813
814 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
815 if (term_element != null)
816 {
817
818 HashSet phrase_query_p_term_x_variants = new HashSet();
819
820 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
821 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
822 {
823 String termValueU = null;
824 String termValueL = null;
825
826 if (term.length() > 1)
827 {
828 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
829 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
830 }
831 else
832 {
833 termValueU = term.substring(0, 1).toUpperCase();
834 termValueL = term.substring(0, 1).toLowerCase();
835 }
836
837 phrase_query_p_term_x_variants.add(termValueU);
838 phrase_query_p_term_x_variants.add(termValueL);
839 }
840 else
841 {
842 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
843 {
844 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
845 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
846 for (int k = 0; k < term_equivalent_terms.length; k++)
847 {
848 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
849 }
850 }
851 }
852 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
853
854 if (in_phrase == false)
855 {
856 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
857 phrase_query_p_term_variants_list = new ArrayList();
858 }
859 }
860 }
861 // Watch for phrases (surrounded by quotes)
862 if (character == '\"')
863 {
864 // Has a phrase just started?
865 if (in_phrase == false)
866 {
867 in_phrase = true;
868 }
869 // Or has a phrase just finished?
870 else if (in_phrase == true)
871 {
872 in_phrase = false;
873 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
874 }
875
876 phrase_query_p_term_variants_list = new ArrayList();
877 }
878 }
879
880 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
881 }
882
883 /**
884 * Highlights query terms in a piece of text.
885 */
886 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
887 {
888 // Convert the content string to an array of characters for speed
889 char[] content_characters = new char[content.length()];
890 content.getChars(0, content.length(), content_characters, 0);
891
892 // Now skim through the content, identifying word matches
893 ArrayList word_matches = new ArrayList();
894 int word_start = 0;
895 boolean in_word = false;
896 boolean preceding_word_matched = false;
897 boolean inTag = false;
898 for (int i = 0; i < content_characters.length; i++)
899 {
900 //We don't want to find words inside HTML tags
901 if(content_characters[i] == '<')
902 {
903 inTag = true;
904 continue;
905 }
906 else if (inTag && content_characters[i] == '>')
907 {
908 inTag = false;
909 }
910 else if (inTag)
911 {
912 continue;
913 }
914
915 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
916
917 // Has a word just started?
918 if (in_word == false && is_character_letter_or_digit == true)
919 {
920 in_word = true;
921 word_start = i;
922 }
923
924 // Or has a word just finished?
925 else if (in_word == true && is_character_letter_or_digit == false)
926 {
927 in_word = false;
928
929 // Check if the word matches any of the query term equivalents
930 String word = new String(content_characters, word_start, (i - word_start));
931 if (query_term_variants.contains(word))
932 {
933 // We have found a matching word, so remember its location
934 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
935 preceding_word_matched = true;
936 }
937 else
938 {
939 preceding_word_matched = false;
940 }
941 }
942 }
943
944 // Don't forget the last word...
945 if (in_word == true)
946 {
947 // Check if the word matches any of the query term equivalents
948 String word = new String(content_characters, word_start, (content_characters.length - word_start));
949 if (query_term_variants.contains(word))
950 {
951 // We have found a matching word, so remember its location
952 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
953 }
954 }
955
956 ArrayList highlight_start_positions = new ArrayList();
957 ArrayList highlight_end_positions = new ArrayList();
958
959 // Deal with phrases now
960 ArrayList partial_phrase_matches = new ArrayList();
961 for (int i = 0; i < word_matches.size(); i++)
962 {
963 WordMatch word_match = (WordMatch) word_matches.get(i);
964
965 // See if any partial phrase matches are extended by this word
966 if (word_match.preceding_word_matched)
967 {
968 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
969 {
970 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
971 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
972 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
973 if (phrase_query_p_term_x_variants.contains(word_match.word))
974 {
975 partial_phrase_match.num_words_matched++;
976
977 // Has a complete phrase match occurred?
978 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
979 {
980 // Check for overlaps by looking at the previous highlight range
981 if (!highlight_end_positions.isEmpty())
982 {
983 int last_highlight_index = highlight_end_positions.size() - 1;
984 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
985 if (last_highlight_end > partial_phrase_match.start_position)
986 {
987 // There is an overlap, so remove the previous phrase match
988 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
989 highlight_end_positions.remove(last_highlight_index);
990 partial_phrase_match.start_position = last_highlight_start;
991 }
992 }
993
994 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
995 highlight_end_positions.add(new Integer(word_match.end_position));
996 }
997 // No, but add the partial match back into the list for next time
998 else
999 {
1000 partial_phrase_matches.add(partial_phrase_match);
1001 }
1002 }
1003 }
1004 }
1005 else
1006 {
1007 partial_phrase_matches.clear();
1008 }
1009
1010 // See if this word is at the start of any of the phrases
1011 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1012 {
1013 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
1014 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1015 if (phrase_query_p_term_1_variants.contains(word_match.word))
1016 {
1017 // If this phrase is just one word long, we have a complete match
1018 if (phrase_query_p_term_variants_list.size() == 1)
1019 {
1020 highlight_start_positions.add(new Integer(word_match.start_position));
1021 highlight_end_positions.add(new Integer(word_match.end_position));
1022 }
1023 // Otherwise we have the start of a potential phrase match
1024 else
1025 {
1026 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1027 }
1028 }
1029 }
1030 }
1031
1032 // Now add the annotation tags into the document at the correct points
1033 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1034
1035 int last_wrote = 0;
1036 for (int i = 0; i < highlight_start_positions.size(); i++)
1037 {
1038 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
1039 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
1040
1041 // Print anything before the highlight range
1042 if (last_wrote < highlight_start)
1043 {
1044 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1045 content_element.appendChild(this.doc.createTextNode(preceding_text));
1046 }
1047
1048 // Print the highlight text, annotated
1049 if (highlight_end > last_wrote)
1050 {
1051 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1052 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1053 annotation_element.setAttribute("type", "query_term");
1054 content_element.appendChild(annotation_element);
1055 last_wrote = highlight_end;
1056 }
1057 }
1058
1059 // Finish off any unwritten text
1060 if (last_wrote < content_characters.length)
1061 {
1062 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1063 content_element.appendChild(this.doc.createTextNode(remaining_text));
1064 }
1065
1066 return content_element;
1067 }
1068
1069 static private class WordMatch
1070 {
1071 public String word;
1072 public int start_position;
1073 public int end_position;
1074 public boolean preceding_word_matched;
1075
1076 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1077 {
1078 this.word = word;
1079 this.start_position = start_position;
1080 this.end_position = end_position;
1081 this.preceding_word_matched = preceding_word_matched;
1082 }
1083 }
1084
1085 static private class PartialPhraseMatch
1086 {
1087 public int start_position;
1088 public int query_phrase_number;
1089 public int num_words_matched;
1090
1091 public PartialPhraseMatch(int start_position, int query_phrase_number)
1092 {
1093 this.start_position = start_position;
1094 this.query_phrase_number = query_phrase_number;
1095 this.num_words_matched = 1;
1096 }
1097 }
1098}
Note: See TracBrowser for help on using the repository browser.