source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 25305

Last change on this file since 25305 was 25305, checked in by kjdon, 12 years ago

tidying up handling of external links and hrefs that are relative greenstone links

  • Property svn:keywords set to Author Date Id Revision
File size: 40.0 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38import org.apache.log4j.*;
39
40/** Action class for retrieving Documents via the message router */
41public class DocumentAction extends Action
42{
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46 // this is used to specify that the sibling nodes of a selected one should be obtained
47 public static final String SIBLING_ARG = "sib";
48 public static final String GOTO_PAGE_ARG = "gp";
49 public static final String ENRICH_DOC_ARG = "end";
50 public static final String EXPAND_DOCUMENT_ARG = "ed";
51 public static final String EXPAND_CONTENTS_ARG = "ec";
52 public static final String REALISTIC_BOOK_ARG = "book";
53
54 /**
55 * if this is set to true, when a document is displayed, any annotation type
56 * services (enrich) will be offered to the user as well
57 */
58 protected boolean provide_annotations = false;
59
60 protected boolean highlight_query_terms = false;
61
62 public boolean configure()
63 {
64 super.configure();
65 String highlight = (String) config_params.get("highlightQueryTerms");
66 if (highlight != null && highlight.equals("true"))
67 {
68 highlight_query_terms = true;
69 }
70 String annotate = (String) config_params.get("displayAnnotationService");
71 if (annotate != null && annotate.equals("true"))
72 {
73 provide_annotations = true;
74 }
75 return true;
76 }
77
78 public Node process(Node message_node)
79 {
80 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
81
82 Element message = this.converter.nodeToElement(message_node);
83
84 // the response
85 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
86 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
87 result.appendChild(page_response);
88
89 // get the request - assume only one
90 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
91 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
92 HashMap params = GSXML.extractParams(cgi_paramList, false);
93
94 // just in case there are some that need to get passed to the services
95 HashMap service_params = (HashMap) params.get("s0");
96
97 String collection = (String) params.get(GSParams.COLLECTION);
98 String document_id = (String) params.get(GSParams.DOCUMENT);
99 if (document_id != null && document_id.equals("")) {
100 document_id = null;
101 }
102 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
103 if (href != null && href.equals("")) {
104 href = null;
105 }
106 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
107 if (document_id == null && href == null)
108 {
109 logger.error("no document specified!");
110 return result;
111 }
112 if (rl != null && rl.equals("0")) {
113 // this is a true external link, we should have been directed to a different page or action
114 logger.error("rl value was 0, shouldn't get here");
115 return result;
116 }
117 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
118 if (document_type == null || document_type.equals(""))
119 {
120 document_type = "simple";
121 }
122 //whether to retrieve siblings or not
123 boolean get_siblings = false;
124 String sibs = (String) params.get(SIBLING_ARG);
125 if (sibs != null && sibs.equals("1"))
126 {
127 get_siblings = true;
128 }
129
130 String doc_id_modifier = "";
131 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
132 if (sibling_num != null && !sibling_num.equals(""))
133 {
134 // we have to modify the doc name
135 doc_id_modifier = "." + sibling_num + ".ss";
136 }
137
138 boolean expand_document = false;
139 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
140 if (ed_arg != null && ed_arg.equals("1"))
141 {
142 expand_document = true;
143 }
144
145 boolean expand_contents = false;
146 if (expand_document)
147 { // we always expand the contents with the text
148 expand_contents = true;
149 }
150 else
151 {
152 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
153 if (ec_arg != null && ec_arg.equals("1"))
154 {
155 expand_contents = true;
156 }
157 }
158
159 UserContext userContext = new UserContext(request);
160
161 //append site metadata
162 addSiteMetadata(page_response, userContext);
163 addInterfaceOptions(page_response);
164
165 // get the additional data needed for the page
166 getBackgroundData(page_response, collection, userContext);
167 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
168
169 // the_document is where all the doc info - structure and metadata etc
170 // is added into, to be returned in the page
171 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
172 page_response.appendChild(the_document);
173
174 // set the doctype from the cgi arg as an attribute
175 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
176
177 // create a basic doc list containing the current node
178 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
179 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
180 basic_doc_list.appendChild(current_doc);
181 if (document_id != null)
182 {
183 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id+doc_id_modifier);
184 }
185 else
186 {
187 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
188 // do we need this??
189 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
190 }
191
192 // Create a parameter list to specify the required structure information
193 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
194
195 if (service_params != null)
196 {
197 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
198 }
199
200 Element ds_param = null;
201 boolean get_structure = false;
202 boolean get_structure_info = false;
203 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
204 {
205 get_structure_info = true;
206
207 if (expand_contents)
208 {
209 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
210 ds_param_list.appendChild(ds_param);
211 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
212 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
213 }
214
215 // get the info needed for paged naviagtion
216 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
217 ds_param_list.appendChild(ds_param);
218 ds_param.setAttribute(GSXML.NAME_ATT, "info");
219 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
220 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
221 ds_param_list.appendChild(ds_param);
222 ds_param.setAttribute(GSXML.NAME_ATT, "info");
223 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
224 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
225 ds_param_list.appendChild(ds_param);
226 ds_param.setAttribute(GSXML.NAME_ATT, "info");
227 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
228
229 if (get_siblings)
230 {
231 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
232 ds_param_list.appendChild(ds_param);
233 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
234 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
235 }
236
237 }
238 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY))
239 {
240 get_structure = true;
241 if (expand_contents)
242 {
243 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
244 ds_param_list.appendChild(ds_param);
245 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
246 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
247 }
248 else
249 {
250 // get the info needed for table of contents
251 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
252 ds_param_list.appendChild(ds_param);
253 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
254 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
255 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
256 ds_param_list.appendChild(ds_param);
257 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
258 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
259 if (get_siblings)
260 {
261 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
262 ds_param_list.appendChild(ds_param);
263 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
264 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
265 }
266 }
267 }
268 else
269 {
270 // we dont need any structure
271 }
272
273 boolean has_dummy = false;
274 if (get_structure || get_structure_info)
275 {
276
277 // Build a request to obtain the document structure
278 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
279 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
280 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
281 ds_message.appendChild(ds_request);
282 ds_request.appendChild(ds_param_list);
283
284 // create a doc_node_list and put in the doc_node that we are interested in
285 ds_request.appendChild(basic_doc_list);
286
287 // Process the document structure retrieve message
288 Element ds_response_message = (Element) this.mr.process(ds_message);
289 if (processErrorElements(ds_response_message, page_response))
290 {
291 return result;
292 }
293
294 // get the info and print out
295 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
296 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
297 path = GSPath.appendLink(path, "nodeStructureInfo");
298 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
299 // get the doc_node bit
300 if (ds_response_struct_info != null)
301 {
302 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
303 }
304 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
305 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
306 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
307 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
308
309 if (ds_response_structure != null)
310 {
311 // add the contents of the structure bit into the_document
312 NodeList structs = ds_response_structure.getChildNodes();
313 for (int i = 0; i < structs.getLength(); i++)
314 {
315 the_document.appendChild(this.doc.importNode(structs.item(i), true));
316 }
317 }
318 else
319 {
320 // no structure nodes, so put in a dummy doc node
321 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
322 if (document_id != null)
323 {
324 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
325 }
326 else
327 {
328 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
329
330 }
331 the_document.appendChild(doc_node);
332 has_dummy = true;
333 }
334 }
335 else
336 { // a simple type - we dont have a dummy node for simple
337 // should think about this more
338 // no structure request, so just put in a dummy doc node
339 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
340 if (document_id != null)
341 {
342 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
343 }
344 else
345 {
346 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
347 }
348 the_document.appendChild(doc_node);
349 has_dummy = true;
350 }
351
352 // Build a request to obtain some document metadata
353 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
354 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
355 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
356 dm_message.appendChild(dm_request);
357 // Create a parameter list to specify the required metadata information
358
359 HashSet meta_names = new HashSet();
360 meta_names.add("Title"); // the default
361 if (format_elem != null)
362 {
363 getRequiredMetadataNames(format_elem, meta_names);
364 }
365
366 Element dm_param_list = createMetadataParamList(meta_names);
367 if (service_params != null)
368 {
369 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
370 }
371
372 dm_request.appendChild(dm_param_list);
373
374 // create the doc node list for the metadata request
375 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
376 dm_request.appendChild(dm_doc_list);
377
378 // Add each node from the structure response into the metadata request
379 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
380 for (int i = 0; i < doc_nodes.getLength(); i++)
381 {
382 Element doc_node = (Element) doc_nodes.item(i);
383 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
384
385 // Add the documentNode to the list
386 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
387 dm_doc_list.appendChild(dm_doc_node);
388 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
389 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
390 }
391
392 // we also want a metadata request to the top level document to get
393 // assocfilepath - this could be cached too
394 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
395 dm_message.appendChild(doc_meta_request);
396 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
397 if (service_params != null)
398 {
399 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
400 }
401
402 doc_meta_request.appendChild(doc_meta_param_list);
403 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
404 doc_meta_param_list.appendChild(doc_param);
405 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
406 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
407
408 // create the doc node list for the metadata request
409 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
410 doc_meta_request.appendChild(doc_list);
411
412 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
413 // the node we want is the root document node
414 if (document_id != null)
415 {
416 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
417 }
418 else
419 {
420 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
421 // can we assume that href is always a top level doc??
422 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
423 //doc_node.setAttribute("externalURL", has_rl);
424 }
425 doc_list.appendChild(doc_node);
426
427 Element dm_response_message = (Element) this.mr.process(dm_message);
428 if (processErrorElements(dm_response_message, page_response))
429 {
430 return result;
431 }
432
433 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
434 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
435
436 // Merge the metadata with the structure information
437 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
438 for (int i = 0; i < doc_nodes.getLength(); i++)
439 {
440 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
441 }
442 // get the top level doc metadata out
443 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
444 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
445 GSXML.mergeMetadataLists(the_document, top_doc_node);
446
447 // Build a request to obtain some document content
448 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
449 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
450 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
451 dc_message.appendChild(dc_request);
452
453 // Create a parameter list to specify the request parameters - empty for now
454 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
455 if (service_params != null)
456 {
457 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
458 }
459
460 dc_request.appendChild(dc_param_list);
461
462 // get the content
463 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
464 if (expand_document)
465 {
466 dc_request.appendChild(dm_doc_list);
467 }
468 else
469 {
470 dc_request.appendChild(basic_doc_list);
471 }
472 logger.debug("request = " + converter.getString(dc_message));
473 Element dc_response_message = (Element) this.mr.process(dc_message);
474 if (processErrorElements(dc_response_message, page_response))
475 {
476 return result;
477 }
478
479 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
480
481 if (expand_document)
482 {
483 // Merge the content with the structure information
484 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
485 for (int i = 0; i < doc_nodes.getLength(); i++)
486 {
487 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
488 if (content != null)
489 {
490 if (highlight_query_terms)
491 {
492 content = highlightQueryTerms(request, (Element) content);
493 }
494 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
495 }
496 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
497 }
498 }
499 else
500 {
501 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
502 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
503 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
504 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
505
506 if (dc_response_doc_content == null)
507 {
508 // no content to add
509 if (dc_response_doc.getAttribute("external").equals("true")) {
510
511 //if (dc_response_doc_external != null)
512 //{
513 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
514
515 the_document.setAttribute("selectedNode", href_id);
516 the_document.setAttribute("external", href_id);
517 }
518 return result;
519 }
520 if (highlight_query_terms)
521 {
522 dc_response_doc.removeChild(dc_response_doc_content);
523
524 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
525 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
526 }
527
528 if (provide_annotations)
529 {
530 String service_selected = (String) params.get(ENRICH_DOC_ARG);
531 if (service_selected != null && service_selected.equals("1"))
532 {
533 // now we can modifiy the response doc if needed
534 String enrich_service = (String) params.get(GSParams.SERVICE);
535 // send a message to the service
536 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
537 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
538 enrich_message.appendChild(enrich_request);
539 // check for parameters
540 HashMap e_service_params = (HashMap) params.get("s1");
541 if (e_service_params != null)
542 {
543 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
544 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
545 enrich_request.appendChild(enrich_pl);
546 }
547 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
548 enrich_request.appendChild(e_doc_list);
549 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
550
551 Node enrich_response = this.mr.process(enrich_message);
552
553 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
554 path = GSPath.createPath(links);
555 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
556
557 }
558 } // if provide_annotations
559
560 // use the returned id rather than the sent one cos there may have
561 // been modifiers such as .pr that are removed.
562 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
563 the_document.setAttribute("selectedNode", modified_doc_id);
564 if (has_dummy)
565 {
566 // change the id if necessary and add the content
567 Element dummy_node = (Element) doc_nodes.item(0);
568
569 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
570 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
571 // hack for simple type
572 if (document_type.equals("simple"))
573 {
574 // we dont want the internal docNode, just want the content and metadata in the document
575 // rethink this!!
576 the_document.removeChild(dummy_node);
577
578 NodeList dummy_children = dummy_node.getChildNodes();
579 //for (int i=0; i<dummy_children.getLength(); i++) {
580 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
581 {
582 // special case as we don't want more than one metadata list
583 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
584 {
585 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
586 }
587 else
588 {
589 the_document.appendChild(dummy_children.item(i));
590 }
591 }
592 }
593 }
594 else
595 {
596 // Merge the document content with the metadata and structure information
597 for (int i = 0; i < doc_nodes.getLength(); i++)
598 {
599 Node dn = doc_nodes.item(i);
600 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
601 if (dn_id.equals(modified_doc_id))
602 {
603 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
604 break;
605 }
606 }
607 }
608 }
609 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
610 return result;
611 }
612
613 /**
614 * tell the param class what its arguments are if an action has its own
615 * arguments, this should add them to the params object - particularly
616 * important for args that should not be saved
617 */
618 public boolean addActionParameters(GSParams params)
619 {
620 params.addParameter(GOTO_PAGE_ARG, false);
621 params.addParameter(ENRICH_DOC_ARG, false);
622 params.addParameter(EXPAND_DOCUMENT_ARG, false);
623 params.addParameter(EXPAND_CONTENTS_ARG, false);
624 params.addParameter(REALISTIC_BOOK_ARG, false);
625
626 return true;
627 }
628
629 /**
630 * this method gets the collection description, the format info, the list of
631 * enrich services, etc - stuff that is needed for the page, but is the same
632 * whatever the query is - should be cached
633 */
634 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
635 {
636
637 // create a message to process - contains requests for the collection
638 // description, the format element, the enrich services on offer
639 // these could all be cached
640 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
641 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
642 // the format request - ignore for now, where does this request go to??
643 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
644 info_message.appendChild(format_request);
645
646 // the enrich_services request - only do this if provide_annotations is true
647
648 if (provide_annotations)
649 {
650 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
651 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
652 info_message.appendChild(enrich_services_request);
653 }
654
655 Element info_response = (Element) this.mr.process(info_message);
656
657 // the collection is the first response
658 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
659 Element format_resp = (Element) responses.item(0);
660
661 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
662 if (format_elem != null)
663 {
664 logger.debug("doc action found a format statement");
665 // set teh format type
666 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
667 page_response.appendChild(this.doc.importNode(format_elem, true));
668 }
669
670 if (provide_annotations)
671 {
672 Element services_resp = (Element) responses.item(1);
673
674 // a new message for the mr
675 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
676
677 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
678 boolean service_found = false;
679 for (int j = 0; j < e_services.getLength(); j++)
680 {
681 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
682 {
683 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
684 enrich_message.appendChild(s);
685 service_found = true;
686 }
687 }
688 if (service_found)
689 {
690 Element enrich_response = (Element) this.mr.process(enrich_message);
691
692 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
693 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
694 for (int i = 0; i < e_responses.getLength(); i++)
695 {
696 Element e_resp = (Element) e_responses.item(i);
697 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
698 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
699 service_list.appendChild(e_service);
700 }
701 page_response.appendChild(service_list);
702 }
703 } // if provide_annotations
704 return true;
705
706 }
707
708 /**
709 * this involves a bit of a hack to get the equivalent query terms - has to
710 * requery the query service - uses the last selected service name. (if it
711 * ends in query). should this action do the query or should it send a
712 * message to the query action? but that will involve lots of extra stuff.
713 * also doesn't handle phrases properly - just highlights all the terms
714 * found in the text.
715 */
716 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
717 {
718
719 // do the query again to get term info
720 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
721 HashMap params = GSXML.extractParams(cgi_param_list, false);
722
723 HashMap previous_params = (HashMap) params.get("p");
724 if (previous_params == null)
725 {
726 return dc_response_doc_content;
727 }
728 String service_name = (String) previous_params.get(GSParams.SERVICE);
729 if (service_name == null || !service_name.endsWith("Query"))
730 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
731 logger.debug("invalid service, not doing highlighting");
732 return dc_response_doc_content;
733 }
734 String collection = (String) params.get(GSParams.COLLECTION);
735 UserContext userContext = new UserContext(request);
736 String to = GSPath.appendLink(collection, service_name);
737
738 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
739 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
740 mr_query_message.appendChild(mr_query_request);
741
742 // paramList
743 HashMap service_params = (HashMap) params.get("s1");
744
745 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
746 GSXML.addParametersToList(this.doc, query_param_list, service_params);
747 mr_query_request.appendChild(query_param_list);
748
749 // do the query
750 Element mr_query_response = (Element) this.mr.process(mr_query_message);
751
752 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
753 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
754 if (query_term_list_element == null)
755 {
756 // no term info
757 logger.error("No query term information.\n");
758 return dc_response_doc_content;
759 }
760
761 String content = GSXML.getNodeText(dc_response_doc_content);
762
763 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
764 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
765
766 HashSet query_term_variants = new HashSet();
767 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
768 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
769 {
770 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
771 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
772 {
773 for (int i = 0; i < terms_nodelist.getLength(); i++)
774 {
775 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
776 String termValueU = null;
777 String termValueL = null;
778
779 if (termValue.length() > 1)
780 {
781 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
782 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
783 }
784 else
785 {
786 termValueU = termValue.substring(0, 1).toUpperCase();
787 termValueL = termValue.substring(0, 1).toLowerCase();
788 }
789
790 query_term_variants.add(termValueU);
791 query_term_variants.add(termValueL);
792 }
793 }
794 }
795 else
796 {
797 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
798 {
799 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
800 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
801 for (int j = 0; j < equivalent_terms.length; j++)
802 {
803 query_term_variants.add(equivalent_terms[j]);
804 }
805 }
806 }
807
808 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
809
810 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
811 String performed_query = GSXML.getNodeText(query_element) + " ";
812
813 ArrayList phrase_query_p_term_variants_list = new ArrayList();
814 int term_start = 0;
815 boolean in_term = false;
816 boolean in_phrase = false;
817 for (int i = 0; i < performed_query.length(); i++)
818 {
819 char character = performed_query.charAt(i);
820 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
821
822 // Has a query term just started?
823 if (in_term == false && is_character_letter_or_digit == true)
824 {
825 in_term = true;
826 term_start = i;
827 }
828
829 // Or has a term just finished?
830 else if (in_term == true && is_character_letter_or_digit == false)
831 {
832 in_term = false;
833 String term = performed_query.substring(term_start, i);
834
835 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
836 if (term_element != null)
837 {
838
839 HashSet phrase_query_p_term_x_variants = new HashSet();
840
841 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
842 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
843 {
844 String termValueU = null;
845 String termValueL = null;
846
847 if (term.length() > 1)
848 {
849 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
850 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
851 }
852 else
853 {
854 termValueU = term.substring(0, 1).toUpperCase();
855 termValueL = term.substring(0, 1).toLowerCase();
856 }
857
858 phrase_query_p_term_x_variants.add(termValueU);
859 phrase_query_p_term_x_variants.add(termValueL);
860 }
861 else
862 {
863 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
864 {
865 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
866 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
867 for (int k = 0; k < term_equivalent_terms.length; k++)
868 {
869 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
870 }
871 }
872 }
873 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
874
875 if (in_phrase == false)
876 {
877 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
878 phrase_query_p_term_variants_list = new ArrayList();
879 }
880 }
881 }
882 // Watch for phrases (surrounded by quotes)
883 if (character == '\"')
884 {
885 // Has a phrase just started?
886 if (in_phrase == false)
887 {
888 in_phrase = true;
889 }
890 // Or has a phrase just finished?
891 else if (in_phrase == true)
892 {
893 in_phrase = false;
894 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
895 }
896
897 phrase_query_p_term_variants_list = new ArrayList();
898 }
899 }
900
901 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
902 }
903
904 /**
905 * Highlights query terms in a piece of text.
906 */
907 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
908 {
909 // Convert the content string to an array of characters for speed
910 char[] content_characters = new char[content.length()];
911 content.getChars(0, content.length(), content_characters, 0);
912
913 // Now skim through the content, identifying word matches
914 ArrayList word_matches = new ArrayList();
915 int word_start = 0;
916 boolean in_word = false;
917 boolean preceding_word_matched = false;
918 boolean inTag = false;
919 for (int i = 0; i < content_characters.length; i++)
920 {
921 //We don't want to find words inside HTML tags
922 if (content_characters[i] == '<')
923 {
924 inTag = true;
925 continue;
926 }
927 else if (inTag && content_characters[i] == '>')
928 {
929 inTag = false;
930 }
931 else if (inTag)
932 {
933 continue;
934 }
935
936 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
937
938 // Has a word just started?
939 if (in_word == false && is_character_letter_or_digit == true)
940 {
941 in_word = true;
942 word_start = i;
943 }
944
945 // Or has a word just finished?
946 else if (in_word == true && is_character_letter_or_digit == false)
947 {
948 in_word = false;
949
950 // Check if the word matches any of the query term equivalents
951 String word = new String(content_characters, word_start, (i - word_start));
952 if (query_term_variants.contains(word))
953 {
954 // We have found a matching word, so remember its location
955 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
956 preceding_word_matched = true;
957 }
958 else
959 {
960 preceding_word_matched = false;
961 }
962 }
963 }
964
965 // Don't forget the last word...
966 if (in_word == true)
967 {
968 // Check if the word matches any of the query term equivalents
969 String word = new String(content_characters, word_start, (content_characters.length - word_start));
970 if (query_term_variants.contains(word))
971 {
972 // We have found a matching word, so remember its location
973 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
974 }
975 }
976
977 ArrayList highlight_start_positions = new ArrayList();
978 ArrayList highlight_end_positions = new ArrayList();
979
980 // Deal with phrases now
981 ArrayList partial_phrase_matches = new ArrayList();
982 for (int i = 0; i < word_matches.size(); i++)
983 {
984 WordMatch word_match = (WordMatch) word_matches.get(i);
985
986 // See if any partial phrase matches are extended by this word
987 if (word_match.preceding_word_matched)
988 {
989 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
990 {
991 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
992 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
993 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
994 if (phrase_query_p_term_x_variants.contains(word_match.word))
995 {
996 partial_phrase_match.num_words_matched++;
997
998 // Has a complete phrase match occurred?
999 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1000 {
1001 // Check for overlaps by looking at the previous highlight range
1002 if (!highlight_end_positions.isEmpty())
1003 {
1004 int last_highlight_index = highlight_end_positions.size() - 1;
1005 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
1006 if (last_highlight_end > partial_phrase_match.start_position)
1007 {
1008 // There is an overlap, so remove the previous phrase match
1009 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
1010 highlight_end_positions.remove(last_highlight_index);
1011 partial_phrase_match.start_position = last_highlight_start;
1012 }
1013 }
1014
1015 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1016 highlight_end_positions.add(new Integer(word_match.end_position));
1017 }
1018 // No, but add the partial match back into the list for next time
1019 else
1020 {
1021 partial_phrase_matches.add(partial_phrase_match);
1022 }
1023 }
1024 }
1025 }
1026 else
1027 {
1028 partial_phrase_matches.clear();
1029 }
1030
1031 // See if this word is at the start of any of the phrases
1032 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1033 {
1034 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
1035 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1036 if (phrase_query_p_term_1_variants.contains(word_match.word))
1037 {
1038 // If this phrase is just one word long, we have a complete match
1039 if (phrase_query_p_term_variants_list.size() == 1)
1040 {
1041 highlight_start_positions.add(new Integer(word_match.start_position));
1042 highlight_end_positions.add(new Integer(word_match.end_position));
1043 }
1044 // Otherwise we have the start of a potential phrase match
1045 else
1046 {
1047 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1048 }
1049 }
1050 }
1051 }
1052
1053 // Now add the annotation tags into the document at the correct points
1054 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1055
1056 int last_wrote = 0;
1057 for (int i = 0; i < highlight_start_positions.size(); i++)
1058 {
1059 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
1060 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
1061
1062 // Print anything before the highlight range
1063 if (last_wrote < highlight_start)
1064 {
1065 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1066 content_element.appendChild(this.doc.createTextNode(preceding_text));
1067 }
1068
1069 // Print the highlight text, annotated
1070 if (highlight_end > last_wrote)
1071 {
1072 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1073 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1074 annotation_element.setAttribute("type", "query_term");
1075 content_element.appendChild(annotation_element);
1076 last_wrote = highlight_end;
1077 }
1078 }
1079
1080 // Finish off any unwritten text
1081 if (last_wrote < content_characters.length)
1082 {
1083 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1084 content_element.appendChild(this.doc.createTextNode(remaining_text));
1085 }
1086
1087 return content_element;
1088 }
1089
1090 static private class WordMatch
1091 {
1092 public String word;
1093 public int start_position;
1094 public int end_position;
1095 public boolean preceding_word_matched;
1096
1097 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1098 {
1099 this.word = word;
1100 this.start_position = start_position;
1101 this.end_position = end_position;
1102 this.preceding_word_matched = preceding_word_matched;
1103 }
1104 }
1105
1106 static private class PartialPhraseMatch
1107 {
1108 public int start_position;
1109 public int query_phrase_number;
1110 public int num_words_matched;
1111
1112 public PartialPhraseMatch(int start_position, int query_phrase_number)
1113 {
1114 this.start_position = start_position;
1115 this.query_phrase_number = query_phrase_number;
1116 this.num_words_matched = 1;
1117 }
1118 }
1119}
Note: See TracBrowser for help on using the repository browser.