source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 25490

Last change on this file since 25490 was 25355, checked in by sjm84, 12 years ago

Some fixes to the file formatting

  • Property svn:keywords set to Author Date Id Revision
File size: 40.0 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37
38import org.apache.log4j.*;
39
40/** Action class for retrieving Documents via the message router */
41public class DocumentAction extends Action
42{
43
44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
45
46 // this is used to specify that the sibling nodes of a selected one should be obtained
47 public static final String SIBLING_ARG = "sib";
48 public static final String GOTO_PAGE_ARG = "gp";
49 public static final String ENRICH_DOC_ARG = "end";
50 public static final String EXPAND_DOCUMENT_ARG = "ed";
51 public static final String EXPAND_CONTENTS_ARG = "ec";
52 public static final String REALISTIC_BOOK_ARG = "book";
53
54 /**
55 * if this is set to true, when a document is displayed, any annotation type
56 * services (enrich) will be offered to the user as well
57 */
58 protected boolean provide_annotations = false;
59
60 protected boolean highlight_query_terms = false;
61
62 public boolean configure()
63 {
64 super.configure();
65 String highlight = (String) config_params.get("highlightQueryTerms");
66 if (highlight != null && highlight.equals("true"))
67 {
68 highlight_query_terms = true;
69 }
70 String annotate = (String) config_params.get("displayAnnotationService");
71 if (annotate != null && annotate.equals("true"))
72 {
73 provide_annotations = true;
74 }
75 return true;
76 }
77
/**
 * Builds the page response for a document request.
 *
 * Reads the request parameters from the first request element of the
 * message, then sends up to three messages through the message router
 * (document structure - paged and hierarchy types only -, document
 * metadata, document content) and merges what comes back into a single
 * document element inside the page response.
 *
 * The method returns the (possibly only partially filled) result early
 * when no document id or href is given, when the rl parameter is "0",
 * when processErrorElements reports a problem with a response, or when
 * the content response carries no content for the requested node.
 *
 * @param message_node the incoming message; only its first request is used
 * @return the message element holding the page response
 */
78 public Node process(Node message_node)
79 {
80 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
81
82 Element message = this.converter.nodeToElement(message_node);
83
84 // the response
85 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
86 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
87 result.appendChild(page_response);
88
89 // get the request - assume only one
90 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
91 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
92 HashMap params = GSXML.extractParams(cgi_paramList, false);
93
94 // just in case there are some that need to get passed to the services
95 HashMap service_params = (HashMap) params.get("s0");
96
97 String collection = (String) params.get(GSParams.COLLECTION);
98 String document_id = (String) params.get(GSParams.DOCUMENT);
// an empty id or href is treated the same as a missing one
99 if (document_id != null && document_id.equals(""))
100 {
101 document_id = null;
102 }
103 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
104 if (href != null && href.equals(""))
105 {
106 href = null;
107 }
108 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
109 if (document_id == null && href == null)
110 {
111 logger.error("no document specified!");
112 return result;
113 }
114 if (rl != null && rl.equals("0"))
115 {
116 // this is a true external link, we should have been directed to a different page or action
117 logger.error("rl value was 0, shouldn't get here");
118 return result;
119 }
120 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
121 if (document_type == null || document_type.equals(""))
122 {
123 document_type = "simple";
124 }
125 //whether to retrieve siblings or not
126 boolean get_siblings = false;
127 String sibs = (String) params.get(SIBLING_ARG);
128 if (sibs != null && sibs.equals("1"))
129 {
130 get_siblings = true;
131 }
132
133 String doc_id_modifier = "";
134 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
135 if (sibling_num != null && !sibling_num.equals(""))
136 {
137 // we have to modify the doc name
138 doc_id_modifier = "." + sibling_num + ".ss";
139 }
140
141 boolean expand_document = false;
142 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
143 if (ed_arg != null && ed_arg.equals("1"))
144 {
145 expand_document = true;
146 }
147
148 boolean expand_contents = false;
149 if (expand_document)
150 { // we always expand the contents with the text
151 expand_contents = true;
152 }
153 else
154 {
155 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
156 if (ec_arg != null && ec_arg.equals("1"))
157 {
158 expand_contents = true;
159 }
160 }
161
162 UserContext userContext = new UserContext(request);
163
164 //append site metadata
165 addSiteMetadata(page_response, userContext);
166 addInterfaceOptions(page_response);
167
168 // get the additional data needed for the page
169 getBackgroundData(page_response, collection, userContext);
// the format element, if any, was added to page_response by getBackgroundData
170 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
171
172 // the_document is where all the doc info - structure and metadata etc
173 // is added into, to be returned in the page
174 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
175 page_response.appendChild(the_document);
176
177 // set the doctype from the cgi arg as an attribute
178 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
179
180 // create a basic doc list containing the current node
181 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
182 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
183 basic_doc_list.appendChild(current_doc);
184 if (document_id != null)
185 {
186 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
187 }
188 else
189 {
190 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
191 // do we need this??
192 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
193 }
194
195 // Create a parameter list to specify the required structure information
196 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
197
198 if (service_params != null)
199 {
200 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
201 }
202
// which structure parameters are requested depends on the document type:
// paged -> structure info (plus optional entire structure / siblings),
// hierarchy -> entire structure, or ancestors + children (+ siblings),
// anything else -> no structure request at all
203 Element ds_param = null;
204 boolean get_structure = false;
205 boolean get_structure_info = false;
206 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
207 {
208 get_structure_info = true;
209
210 if (expand_contents)
211 {
212 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
213 ds_param_list.appendChild(ds_param);
214 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
215 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
216 }
217
218 // get the info needed for paged navigation
219 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
220 ds_param_list.appendChild(ds_param);
221 ds_param.setAttribute(GSXML.NAME_ATT, "info");
222 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
223 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
224 ds_param_list.appendChild(ds_param);
225 ds_param.setAttribute(GSXML.NAME_ATT, "info");
226 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
227 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
228 ds_param_list.appendChild(ds_param);
229 ds_param.setAttribute(GSXML.NAME_ATT, "info");
230 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
231
232 if (get_siblings)
233 {
234 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
235 ds_param_list.appendChild(ds_param);
236 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
237 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
238 }
239
240 }
241 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY))
242 {
243 get_structure = true;
244 if (expand_contents)
245 {
246 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
247 ds_param_list.appendChild(ds_param);
248 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
249 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
250 }
251 else
252 {
253 // get the info needed for table of contents
254 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
255 ds_param_list.appendChild(ds_param);
256 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
257 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
258 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
259 ds_param_list.appendChild(ds_param);
260 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
261 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
262 if (get_siblings)
263 {
264 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
265 ds_param_list.appendChild(ds_param);
266 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
267 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
268 }
269 }
270 }
271 else
272 {
273 // we dont need any structure
274 }
275
// has_dummy records that the_document holds a single placeholder doc node
// created here rather than structure nodes returned by the service
276 boolean has_dummy = false;
277 if (get_structure || get_structure_info)
278 {
279
280 // Build a request to obtain the document structure
281 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
282 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
283 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
284 ds_message.appendChild(ds_request);
285 ds_request.appendChild(ds_param_list);
286
287 // create a doc_node_list and put in the doc_node that we are interested in
// NOTE: the same basic_doc_list element is appended to the content request
// further down when the document is not expanded; DOM appendChild moves a
// node that already has a parent, so it leaves ds_request at that point
288 ds_request.appendChild(basic_doc_list);
289
290 // Process the document structure retrieve message
291 Element ds_response_message = (Element) this.mr.process(ds_message);
292 if (processErrorElements(ds_response_message, page_response))
293 {
294 return result;
295 }
296
297 // get the info and print out
298 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
299 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
300 path = GSPath.appendLink(path, "nodeStructureInfo");
301 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
302 // get the doc_node bit
303 if (ds_response_struct_info != null)
304 {
305 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
306 }
307 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
308 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
309 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
310 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
311
312 if (ds_response_structure != null)
313 {
314 // add the contents of the structure bit into the_document
315 NodeList structs = ds_response_structure.getChildNodes();
316 for (int i = 0; i < structs.getLength(); i++)
317 {
318 the_document.appendChild(this.doc.importNode(structs.item(i), true));
319 }
320 }
321 else
322 {
323 // no structure nodes, so put in a dummy doc node
324 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
325 if (document_id != null)
326 {
327 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
328 }
329 else
330 {
331 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
332
333 }
334 the_document.appendChild(doc_node);
335 has_dummy = true;
336 }
337 }
338 else
339 { // a simple type - we dont have a dummy node for simple
340 // should think about this more
341 // no structure request, so just put in a dummy doc node
342 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
343 if (document_id != null)
344 {
345 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
346 }
347 else
348 {
349 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
350 }
351 the_document.appendChild(doc_node);
352 has_dummy = true;
353 }
354
355 // Build a request to obtain some document metadata
// dm_message ends up with two requests: the per-node metadata request
// (dm_request) and the top level assocfilepath request (doc_meta_request)
356 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
357 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
358 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
359 dm_message.appendChild(dm_request);
360 // Create a parameter list to specify the required metadata information
361
362 HashSet meta_names = new HashSet();
363 meta_names.add("Title"); // the default
364 if (format_elem != null)
365 {
366 getRequiredMetadataNames(format_elem, meta_names);
367 }
368
369 Element dm_param_list = createMetadataParamList(meta_names);
370 if (service_params != null)
371 {
372 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
373 }
374
375 dm_request.appendChild(dm_param_list);
376
377 // create the doc node list for the metadata request
378 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
379 dm_request.appendChild(dm_doc_list);
380
381 // Add each node from the structure response into the metadata request
// getElementsByTagName matches descendants at any depth, so nested
// structure nodes are included as well
// NOTE(review): when the document was addressed by href, the dummy node
// created above only carries the href attribute, so the node id copied
// below is empty - confirm that this is intended
382 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
383 for (int i = 0; i < doc_nodes.getLength(); i++)
384 {
385 Element doc_node = (Element) doc_nodes.item(i);
386 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
387
388 // Add the documentNode to the list
389 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
390 dm_doc_list.appendChild(dm_doc_node);
391 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
392 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
393 }
394
395 // we also want a metadata request to the top level document to get
396 // assocfilepath - this could be cached too
397 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
398 dm_message.appendChild(doc_meta_request);
399 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
400 if (service_params != null)
401 {
402 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
403 }
404
405 doc_meta_request.appendChild(doc_meta_param_list);
406 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
407 doc_meta_param_list.appendChild(doc_param);
408 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
409 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
410
411 // create the doc node list for the metadata request
412 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
413 doc_meta_request.appendChild(doc_list);
414
415 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
416 // the node we want is the root document node
417 if (document_id != null)
418 {
419 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
420 }
421 else
422 {
423 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
424 // can we assume that href is always a top level doc??
425 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
426 //doc_node.setAttribute("externalURL", has_rl);
427 }
428 doc_list.appendChild(doc_node);
429
430 Element dm_response_message = (Element) this.mr.process(dm_message);
431 if (processErrorElements(dm_response_message, page_response))
432 {
433 return result;
434 }
435
436 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
437 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
438
439 // Merge the metadata with the structure information
// the merge is positional: the i-th child of the response list is merged
// into the i-th requested node
// NOTE(review): dm_response_doc_list is used without a null check
440 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
441 for (int i = 0; i < doc_nodes.getLength(); i++)
442 {
443 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
444 }
445 // get the top level doc metadata out
// the assocfilepath request was the second request in dm_message; this
// assumes the responses come back in request order, hence index 1
446 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
447 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
448 GSXML.mergeMetadataLists(the_document, top_doc_node);
449
450 // Build a request to obtain some document content
451 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
452 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
453 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
454 dc_message.appendChild(dc_request);
455
456 // Create a parameter list to specify the request parameters - empty for now
457 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
458 if (service_params != null)
459 {
460 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
461 }
462
463 dc_request.appendChild(dc_param_list);
464
465 // get the content
466 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
// either list element is moved (not copied) into dc_request; the request
// it belonged to has already been processed by this point
467 if (expand_document)
468 {
469 dc_request.appendChild(dm_doc_list);
470 }
471 else
472 {
473 dc_request.appendChild(basic_doc_list);
474 }
475 logger.debug("request = " + converter.getString(dc_message));
476 Element dc_response_message = (Element) this.mr.process(dc_message);
477 if (processErrorElements(dc_response_message, page_response))
478 {
479 return result;
480 }
481
// path still holds the response doc node list path built for the metadata step
482 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
483
484 if (expand_document)
485 {
486 // Merge the content with the structure information
// positional again: content of the i-th response node goes to the i-th doc node
487 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
488 for (int i = 0; i < doc_nodes.getLength(); i++)
489 {
490 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
491 if (content != null)
492 {
493 if (highlight_query_terms)
494 {
495 content = highlightQueryTerms(request, (Element) content);
496 }
497 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
498 }
499 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
500 }
501 }
502 else
503 {
504 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
505 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
506 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
507 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
508
509 if (dc_response_doc_content == null)
510 {
511 // no content to add
// NOTE(review): dc_response_doc is dereferenced here without a null check
512 if (dc_response_doc.getAttribute("external").equals("true"))
513 {
514
515 //if (dc_response_doc_external != null)
516 //{
517 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
518
519 the_document.setAttribute("selectedNode", href_id);
520 the_document.setAttribute("external", href_id);
521 }
// early return: the debug dump of the page at the end of the method is skipped
522 return result;
523 }
524 if (highlight_query_terms)
525 {
// swap the content child of the response node for its highlighted version
526 dc_response_doc.removeChild(dc_response_doc_content);
527
528 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
529 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
530 }
531
532 if (provide_annotations)
533 {
534 String service_selected = (String) params.get(ENRICH_DOC_ARG);
535 if (service_selected != null && service_selected.equals("1"))
536 {
537 // now we can modify the response doc if needed
538 String enrich_service = (String) params.get(GSParams.SERVICE);
539 // send a message to the service
540 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
541 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
542 enrich_message.appendChild(enrich_request);
543 // check for parameters
544 HashMap e_service_params = (HashMap) params.get("s1");
545 if (e_service_params != null)
546 {
547 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
548 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
549 enrich_request.appendChild(enrich_pl);
550 }
551 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
552 enrich_request.appendChild(e_doc_list);
553 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
554
555 Node enrich_response = this.mr.process(enrich_message);
556
557 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
558 path = GSPath.createPath(links);
// the enriched content replaces the retrieved content from here on
// NOTE(review): the result is not checked; if the enrich response has no
// content node, the importNode calls below would receive null - confirm
559 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
560
561 }
562 } // if provide_annotations
563
564 // use the returned id rather than the sent one cos there may have
565 // been modifiers such as .pr that are removed.
566 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
567 the_document.setAttribute("selectedNode", modified_doc_id);
568 if (has_dummy)
569 {
570 // change the id if necessary and add the content
571 Element dummy_node = (Element) doc_nodes.item(0);
572
573 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
574 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
575 // hack for simple type
576 if (document_type.equals("simple"))
577 {
578 // we dont want the internal docNode, just want the content and metadata in the document
579 // rethink this!!
580 the_document.removeChild(dummy_node);
581
582 NodeList dummy_children = dummy_node.getChildNodes();
583 //for (int i=0; i<dummy_children.getLength(); i++) {
// presumably iterated backwards because appendChild below takes the child
// out of dummy_node, which shrinks this child list while it is being
// walked; as a side effect the moved children reach the_document in
// reverse order
584 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
585 {
586 // special case as we don't want more than one metadata list
587 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
588 {
589 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
590 }
591 else
592 {
593 the_document.appendChild(dummy_children.item(i));
594 }
595 }
596 }
597 }
598 else
599 {
600 // Merge the document content with the metadata and structure information
// only the first doc node whose id equals the returned id gets the content
601 for (int i = 0; i < doc_nodes.getLength(); i++)
602 {
603 Node dn = doc_nodes.item(i);
604 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
605 if (dn_id.equals(modified_doc_id))
606 {
607 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
608 break;
609 }
610 }
611 }
612 }
613 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
614 return result;
615 }
616
617 /**
618 * tell the param class what its arguments are if an action has its own
619 * arguments, this should add them to the params object - particularly
620 * important for args that should not be saved
621 */
622 public boolean addActionParameters(GSParams params)
623 {
624 params.addParameter(GOTO_PAGE_ARG, false);
625 params.addParameter(ENRICH_DOC_ARG, false);
626 params.addParameter(EXPAND_DOCUMENT_ARG, false);
627 params.addParameter(EXPAND_CONTENTS_ARG, false);
628 params.addParameter(REALISTIC_BOOK_ARG, false);
629
630 return true;
631 }
632
633 /**
634 * this method gets the collection description, the format info, the list of
635 * enrich services, etc - stuff that is needed for the page, but is the same
636 * whatever the query is - should be cached
637 */
638 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
639 {
640
641 // create a message to process - contains requests for the collection
642 // description, the format element, the enrich services on offer
643 // these could all be cached
644 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
645 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
646 // the format request - ignore for now, where does this request go to??
647 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
648 info_message.appendChild(format_request);
649
650 // the enrich_services request - only do this if provide_annotations is true
651
652 if (provide_annotations)
653 {
654 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
655 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
656 info_message.appendChild(enrich_services_request);
657 }
658
659 Element info_response = (Element) this.mr.process(info_message);
660
661 // the collection is the first response
662 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
663 Element format_resp = (Element) responses.item(0);
664
665 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
666 if (format_elem != null)
667 {
668 logger.debug("doc action found a format statement");
669 // set teh format type
670 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
671 page_response.appendChild(this.doc.importNode(format_elem, true));
672 }
673
674 if (provide_annotations)
675 {
676 Element services_resp = (Element) responses.item(1);
677
678 // a new message for the mr
679 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
680
681 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
682 boolean service_found = false;
683 for (int j = 0; j < e_services.getLength(); j++)
684 {
685 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
686 {
687 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
688 enrich_message.appendChild(s);
689 service_found = true;
690 }
691 }
692 if (service_found)
693 {
694 Element enrich_response = (Element) this.mr.process(enrich_message);
695
696 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
697 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
698 for (int i = 0; i < e_responses.getLength(); i++)
699 {
700 Element e_resp = (Element) e_responses.item(i);
701 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
702 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
703 service_list.appendChild(e_service);
704 }
705 page_response.appendChild(service_list);
706 }
707 } // if provide_annotations
708 return true;
709
710 }
711
712 /**
713 * this involves a bit of a hack to get the equivalent query terms - has to
714 * requery the query service - uses the last selected service name. (if it
715 * ends in query). should this action do the query or should it send a
716 * message to the query action? but that will involve lots of extra stuff.
717 * also doesn't handle phrases properly - just highlights all the terms
718 * found in the text.
719 */
720 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
721 {
722 // do the query again to get term info
723 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
724 HashMap params = GSXML.extractParams(cgi_param_list, false);
725
726 HashMap previous_params = (HashMap) params.get("p");
727 if (previous_params == null)
728 {
729 return dc_response_doc_content;
730 }
731 String service_name = (String) previous_params.get(GSParams.SERVICE);
732 if (service_name == null || !service_name.endsWith("Query"))
733 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
734 logger.debug("invalid service, not doing highlighting");
735 return dc_response_doc_content;
736 }
737 String collection = (String) params.get(GSParams.COLLECTION);
738 UserContext userContext = new UserContext(request);
739 String to = GSPath.appendLink(collection, service_name);
740
741 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
742 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
743 mr_query_message.appendChild(mr_query_request);
744
745 // paramList
746 HashMap service_params = (HashMap) params.get("s1");
747
748 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
749 GSXML.addParametersToList(this.doc, query_param_list, service_params);
750 mr_query_request.appendChild(query_param_list);
751
752 // do the query
753 Element mr_query_response = (Element) this.mr.process(mr_query_message);
754
755 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
756 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
757 if (query_term_list_element == null)
758 {
759 // no term info
760 logger.error("No query term information.\n");
761 return dc_response_doc_content;
762 }
763
764 String content = GSXML.getNodeText(dc_response_doc_content);
765
766 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
767 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
768
769 HashSet query_term_variants = new HashSet();
770 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
771 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
772 {
773 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
774 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
775 {
776 for (int i = 0; i < terms_nodelist.getLength(); i++)
777 {
778 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
779 String termValueU = null;
780 String termValueL = null;
781
782 if (termValue.length() > 1)
783 {
784 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
785 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
786 }
787 else
788 {
789 termValueU = termValue.substring(0, 1).toUpperCase();
790 termValueL = termValue.substring(0, 1).toLowerCase();
791 }
792
793 query_term_variants.add(termValueU);
794 query_term_variants.add(termValueL);
795 }
796 }
797 }
798 else
799 {
800 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
801 {
802 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
803 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
804 for (int j = 0; j < equivalent_terms.length; j++)
805 {
806 query_term_variants.add(equivalent_terms[j]);
807 }
808 }
809 }
810
811 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
812
813 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
814 String performed_query = GSXML.getNodeText(query_element) + " ";
815
816 ArrayList phrase_query_p_term_variants_list = new ArrayList();
817 int term_start = 0;
818 boolean in_term = false;
819 boolean in_phrase = false;
820 for (int i = 0; i < performed_query.length(); i++)
821 {
822 char character = performed_query.charAt(i);
823 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
824
825 // Has a query term just started?
826 if (in_term == false && is_character_letter_or_digit == true)
827 {
828 in_term = true;
829 term_start = i;
830 }
831
832 // Or has a term just finished?
833 else if (in_term == true && is_character_letter_or_digit == false)
834 {
835 in_term = false;
836 String term = performed_query.substring(term_start, i);
837
838 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
839 if (term_element != null)
840 {
841
842 HashSet phrase_query_p_term_x_variants = new HashSet();
843
844 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
845 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
846 {
847 String termValueU = null;
848 String termValueL = null;
849
850 if (term.length() > 1)
851 {
852 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
853 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
854 }
855 else
856 {
857 termValueU = term.substring(0, 1).toUpperCase();
858 termValueL = term.substring(0, 1).toLowerCase();
859 }
860
861 phrase_query_p_term_x_variants.add(termValueU);
862 phrase_query_p_term_x_variants.add(termValueL);
863 }
864 else
865 {
866 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
867 {
868 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
869 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
870 for (int k = 0; k < term_equivalent_terms.length; k++)
871 {
872 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
873 }
874 }
875 }
876 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
877
878 if (in_phrase == false)
879 {
880 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
881 phrase_query_p_term_variants_list = new ArrayList();
882 }
883 }
884 }
885 // Watch for phrases (surrounded by quotes)
886 if (character == '\"')
887 {
888 // Has a phrase just started?
889 if (in_phrase == false)
890 {
891 in_phrase = true;
892 }
893 // Or has a phrase just finished?
894 else if (in_phrase == true)
895 {
896 in_phrase = false;
897 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
898 }
899
900 phrase_query_p_term_variants_list = new ArrayList();
901 }
902 }
903
904 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
905 }
906
907 /**
908 * Highlights query terms in a piece of text.
909 */
910 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
911 {
912 // Convert the content string to an array of characters for speed
913 char[] content_characters = new char[content.length()];
914 content.getChars(0, content.length(), content_characters, 0);
915
916 // Now skim through the content, identifying word matches
917 ArrayList word_matches = new ArrayList();
918 int word_start = 0;
919 boolean in_word = false;
920 boolean preceding_word_matched = false;
921 boolean inTag = false;
922 for (int i = 0; i < content_characters.length; i++)
923 {
924 //We don't want to find words inside HTML tags
925 if (content_characters[i] == '<')
926 {
927 inTag = true;
928 continue;
929 }
930 else if (inTag && content_characters[i] == '>')
931 {
932 inTag = false;
933 }
934 else if (inTag)
935 {
936 continue;
937 }
938
939 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
940
941 // Has a word just started?
942 if (in_word == false && is_character_letter_or_digit == true)
943 {
944 in_word = true;
945 word_start = i;
946 }
947
948 // Or has a word just finished?
949 else if (in_word == true && is_character_letter_or_digit == false)
950 {
951 in_word = false;
952
953 // Check if the word matches any of the query term equivalents
954 String word = new String(content_characters, word_start, (i - word_start));
955 if (query_term_variants.contains(word))
956 {
957 // We have found a matching word, so remember its location
958 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
959 preceding_word_matched = true;
960 }
961 else
962 {
963 preceding_word_matched = false;
964 }
965 }
966 }
967
968 // Don't forget the last word...
969 if (in_word == true)
970 {
971 // Check if the word matches any of the query term equivalents
972 String word = new String(content_characters, word_start, (content_characters.length - word_start));
973 if (query_term_variants.contains(word))
974 {
975 // We have found a matching word, so remember its location
976 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
977 }
978 }
979
980 ArrayList highlight_start_positions = new ArrayList();
981 ArrayList highlight_end_positions = new ArrayList();
982
983 // Deal with phrases now
984 ArrayList partial_phrase_matches = new ArrayList();
985 for (int i = 0; i < word_matches.size(); i++)
986 {
987 WordMatch word_match = (WordMatch) word_matches.get(i);
988
989 // See if any partial phrase matches are extended by this word
990 if (word_match.preceding_word_matched)
991 {
992 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
993 {
994 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
995 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
996 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
997 if (phrase_query_p_term_x_variants.contains(word_match.word))
998 {
999 partial_phrase_match.num_words_matched++;
1000
1001 // Has a complete phrase match occurred?
1002 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1003 {
1004 // Check for overlaps by looking at the previous highlight range
1005 if (!highlight_end_positions.isEmpty())
1006 {
1007 int last_highlight_index = highlight_end_positions.size() - 1;
1008 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
1009 if (last_highlight_end > partial_phrase_match.start_position)
1010 {
1011 // There is an overlap, so remove the previous phrase match
1012 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
1013 highlight_end_positions.remove(last_highlight_index);
1014 partial_phrase_match.start_position = last_highlight_start;
1015 }
1016 }
1017
1018 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1019 highlight_end_positions.add(new Integer(word_match.end_position));
1020 }
1021 // No, but add the partial match back into the list for next time
1022 else
1023 {
1024 partial_phrase_matches.add(partial_phrase_match);
1025 }
1026 }
1027 }
1028 }
1029 else
1030 {
1031 partial_phrase_matches.clear();
1032 }
1033
1034 // See if this word is at the start of any of the phrases
1035 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1036 {
1037 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
1038 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1039 if (phrase_query_p_term_1_variants.contains(word_match.word))
1040 {
1041 // If this phrase is just one word long, we have a complete match
1042 if (phrase_query_p_term_variants_list.size() == 1)
1043 {
1044 highlight_start_positions.add(new Integer(word_match.start_position));
1045 highlight_end_positions.add(new Integer(word_match.end_position));
1046 }
1047 // Otherwise we have the start of a potential phrase match
1048 else
1049 {
1050 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1051 }
1052 }
1053 }
1054 }
1055
1056 // Now add the annotation tags into the document at the correct points
1057 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1058
1059 int last_wrote = 0;
1060 for (int i = 0; i < highlight_start_positions.size(); i++)
1061 {
1062 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
1063 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
1064
1065 // Print anything before the highlight range
1066 if (last_wrote < highlight_start)
1067 {
1068 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1069 content_element.appendChild(this.doc.createTextNode(preceding_text));
1070 }
1071
1072 // Print the highlight text, annotated
1073 if (highlight_end > last_wrote)
1074 {
1075 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1076 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1077 annotation_element.setAttribute("type", "query_term");
1078 content_element.appendChild(annotation_element);
1079 last_wrote = highlight_end;
1080 }
1081 }
1082
1083 // Finish off any unwritten text
1084 if (last_wrote < content_characters.length)
1085 {
1086 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1087 content_element.appendChild(this.doc.createTextNode(remaining_text));
1088 }
1089
1090 return content_element;
1091 }
1092
1093 static private class WordMatch
1094 {
1095 public String word;
1096 public int start_position;
1097 public int end_position;
1098 public boolean preceding_word_matched;
1099
1100 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1101 {
1102 this.word = word;
1103 this.start_position = start_position;
1104 this.end_position = end_position;
1105 this.preceding_word_matched = preceding_word_matched;
1106 }
1107 }
1108
1109 static private class PartialPhraseMatch
1110 {
1111 public int start_position;
1112 public int query_phrase_number;
1113 public int num_words_matched;
1114
1115 public PartialPhraseMatch(int start_position, int query_phrase_number)
1116 {
1117 this.start_position = start_position;
1118 this.query_phrase_number = query_phrase_number;
1119 this.num_words_matched = 1;
1120 }
1121 }
1122}
Note: See TracBrowser for help on using the repository browser.