source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 25816

Last change on this file since 25816 was 25816, checked in by kjdon, 12 years ago

If no document type is specified in the CGI params, get it from the collection rather than just assuming hierarchy.

  • Property svn:keywords set to Author Date Id Revision
File size: 42.5 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37import java.io.Serializable;
38
39import org.apache.log4j.*;
40
41/** Action class for retrieving Documents via the message router */
42public class DocumentAction extends Action
43{
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
46
47 // this is used to specify that the sibling nodes of a selected one should be obtained
48 public static final String SIBLING_ARG = "sib";
49 public static final String GOTO_PAGE_ARG = "gp";
50 public static final String ENRICH_DOC_ARG = "end";
51 public static final String EXPAND_DOCUMENT_ARG = "ed";
52 public static final String EXPAND_CONTENTS_ARG = "ec";
53 public static final String REALISTIC_BOOK_ARG = "book";
54
55 /**
56 * if this is set to true, when a document is displayed, any annotation type
57 * services (enrich) will be offered to the user as well
58 */
59 protected boolean provide_annotations = false;
60
61 protected boolean highlight_query_terms = false;
62
63 public boolean configure()
64 {
65 super.configure();
66 String highlight = (String) config_params.get("highlightQueryTerms");
67 if (highlight != null && highlight.equals("true"))
68 {
69 highlight_query_terms = true;
70 }
71 String annotate = (String) config_params.get("displayAnnotationService");
72 if (annotate != null && annotate.equals("true"))
73 {
74 provide_annotations = true;
75 }
76 return true; }
77
78 public Node process(Node message_node)
79 {
80 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
81
82 Element message = this.converter.nodeToElement(message_node);
83
84 // the response
85 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
86 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
87 result.appendChild(page_response);
88
89 // get the request - assume only one
90 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
91 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
92 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
93
94 // just in case there are some that need to get passed to the services
95 HashMap service_params = (HashMap) params.get("s0");
96
97 String collection = (String) params.get(GSParams.COLLECTION);
98 String document_id = (String) params.get(GSParams.DOCUMENT);
99 if (document_id != null && document_id.equals(""))
100 {
101 document_id = null;
102 }
103 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
104 if (href != null && href.equals(""))
105 {
106 href = null;
107 }
108 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
109 if (document_id == null && href == null)
110 {
111 logger.error("no document specified!");
112 return result;
113 }
114 if (rl != null && rl.equals("0"))
115 {
116 // this is a true external link, we should have been directed to a different page or action
117 logger.error("rl value was 0, shouldn't get here");
118 return result;
119 }
120 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
121 if (document_type != null && document_type.equals(""))
122 {
123 //document_type = "hierarchy";
124 document_type = null; // we'll get it later if not already specified
125 }
126 //whether to retrieve siblings or not
127 boolean get_siblings = false;
128 String sibs = (String) params.get(SIBLING_ARG);
129 if (sibs != null && sibs.equals("1"))
130 {
131 get_siblings = true;
132 }
133
134 String doc_id_modifier = "";
135 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
136 if (sibling_num != null && !sibling_num.equals(""))
137 {
138 // we have to modify the doc name
139 doc_id_modifier = "." + sibling_num + ".ss";
140 }
141
142 boolean expand_document = false;
143 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
144 if (ed_arg != null && ed_arg.equals("1"))
145 {
146 expand_document = true;
147 }
148
149 boolean expand_contents = false;
150 if (expand_document)
151 { // we always expand the contents with the text
152 expand_contents = true;
153 }
154 else
155 {
156 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
157 if (ec_arg != null && ec_arg.equals("1"))
158 {
159 expand_contents = true;
160 }
161 }
162
163 UserContext userContext = new UserContext(request);
164
165 //append site metadata
166 addSiteMetadata(page_response, userContext);
167 addInterfaceOptions(page_response);
168
169 // get the additional data needed for the page
170 getBackgroundData(page_response, collection, userContext);
171 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
172
173 // the_document is where all the doc info - structure and metadata etc
174 // is added into, to be returned in the page
175 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
176 page_response.appendChild(the_document);
177
178 // create a basic doc list containing the current node
179 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
180 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
181 basic_doc_list.appendChild(current_doc);
182 if (document_id != null)
183 {
184 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
185 }
186 else
187 {
188 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
189 // do we need this??
190 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
191 }
192
193 if (document_type == null) {
194 logger.error("getting document type");
195 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
196 logger.error("new doc type = "+document_type);
197 }
198 if (document_type != null) {
199 // set the doctype from the cgi arg or from the server as an attribute
200 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
201 }
202 else {
203 logger.error("doctype is null!!!***********");
204 }
205
206 // Create a parameter list to specify the required structure information
207 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
208
209 if (service_params != null)
210 {
211 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
212 }
213
214 Element ds_param = null;
215 boolean get_structure = false;
216 boolean get_structure_info = false;
217 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
218 {
219 get_structure_info = true;
220
221 if (expand_contents)
222 {
223 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
224 ds_param_list.appendChild(ds_param);
225 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
226 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
227 }
228
229 // get the info needed for paged naviagtion
230 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
231 ds_param_list.appendChild(ds_param);
232 ds_param.setAttribute(GSXML.NAME_ATT, "info");
233 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
234 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
235 ds_param_list.appendChild(ds_param);
236 ds_param.setAttribute(GSXML.NAME_ATT, "info");
237 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
238 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
239 ds_param_list.appendChild(ds_param);
240 ds_param.setAttribute(GSXML.NAME_ATT, "info");
241 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
242
243 if (get_siblings)
244 {
245 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
246 ds_param_list.appendChild(ds_param);
247 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
248 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
249 }
250
251 }
252 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY))
253 {
254 get_structure = true;
255 if (expand_contents)
256 {
257 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
258 ds_param_list.appendChild(ds_param);
259 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
260 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
261 }
262 else
263 {
264 // get the info needed for table of contents
265 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
266 ds_param_list.appendChild(ds_param);
267 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
268 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
269 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
270 ds_param_list.appendChild(ds_param);
271 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
272 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
273 if (get_siblings)
274 {
275 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
276 ds_param_list.appendChild(ds_param);
277 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
278 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
279 }
280 }
281 }
282 else
283 {
284 // we dont need any structure
285 }
286
287 boolean has_dummy = false;
288 if (get_structure || get_structure_info)
289 {
290
291 // Build a request to obtain the document structure
292 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
293 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
294 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
295 ds_message.appendChild(ds_request);
296 ds_request.appendChild(ds_param_list);
297
298 // add the node list we created earlier
299 ds_request.appendChild(basic_doc_list);
300
301 // Process the document structure retrieve message
302 Element ds_response_message = (Element) this.mr.process(ds_message);
303 if (processErrorElements(ds_response_message, page_response))
304 {
305 return result;
306 }
307
308 // get the info and print out
309 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
310 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
311 path = GSPath.appendLink(path, "nodeStructureInfo");
312 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
313 // get the doc_node bit
314 if (ds_response_struct_info != null)
315 {
316 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
317 }
318 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
319 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
320 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
321 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
322
323 if (ds_response_structure != null)
324 {
325 // add the contents of the structure bit into the_document
326 NodeList structs = ds_response_structure.getChildNodes();
327 for (int i = 0; i < structs.getLength(); i++)
328 {
329 the_document.appendChild(this.doc.importNode(structs.item(i), true));
330 }
331 }
332 else
333 {
334 // no structure nodes, so put in a dummy doc node
335 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
336 if (document_id != null)
337 {
338 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
339 }
340 else
341 {
342 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
343
344 }
345 the_document.appendChild(doc_node);
346 has_dummy = true;
347 }
348 }
349 else
350 { // a simple type - we dont have a dummy node for simple
351 // should think about this more
352 // no structure request, so just put in a dummy doc node
353 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
354 if (document_id != null)
355 {
356 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
357 }
358 else
359 {
360 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
361 }
362 the_document.appendChild(doc_node);
363 has_dummy = true;
364 }
365
366 // Build a request to obtain some document metadata
367 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
368 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
369 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
370 dm_message.appendChild(dm_request);
371 // Create a parameter list to specify the required metadata information
372
373 HashSet<String> meta_names = new HashSet<String>();
374 meta_names.add("Title"); // the default
375 if (format_elem != null)
376 {
377 getRequiredMetadataNames(format_elem, meta_names);
378 }
379
380 Element dm_param_list = createMetadataParamList(meta_names);
381 if (service_params != null)
382 {
383 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
384 }
385
386 dm_request.appendChild(dm_param_list);
387
388 // create the doc node list for the metadata request
389 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
390 dm_request.appendChild(dm_doc_list);
391
392 // Add each node from the structure response into the metadata request
393 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
394 for (int i = 0; i < doc_nodes.getLength(); i++)
395 {
396 Element doc_node = (Element) doc_nodes.item(i);
397 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
398
399 // Add the documentNode to the list
400 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
401 dm_doc_list.appendChild(dm_doc_node);
402 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
403 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
404 }
405
406 // we also want a metadata request to the top level document to get
407 // assocfilepath - this could be cached too
408 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
409 dm_message.appendChild(doc_meta_request);
410 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
411 if (service_params != null)
412 {
413 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
414 }
415
416 doc_meta_request.appendChild(doc_meta_param_list);
417 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
418 doc_meta_param_list.appendChild(doc_param);
419 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
420 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
421
422 // create the doc node list for the metadata request
423 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
424 doc_meta_request.appendChild(doc_list);
425
426 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
427 // the node we want is the root document node
428 if (document_id != null)
429 {
430 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
431 }
432 else
433 {
434 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
435 // can we assume that href is always a top level doc??
436 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
437 //doc_node.setAttribute("externalURL", has_rl);
438 }
439 doc_list.appendChild(doc_node);
440
441 Element dm_response_message = (Element) this.mr.process(dm_message);
442 if (processErrorElements(dm_response_message, page_response))
443 {
444 return result;
445 }
446
447 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
448 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
449
450 // Merge the metadata with the structure information
451 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
452 for (int i = 0; i < doc_nodes.getLength(); i++)
453 {
454 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
455 }
456 // get the top level doc metadata out
457 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
458 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
459 GSXML.mergeMetadataLists(the_document, top_doc_node);
460
461 // Build a request to obtain some document content
462 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
463 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
464 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
465 dc_message.appendChild(dc_request);
466
467 // Create a parameter list to specify the request parameters - empty for now
468 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
469 if (service_params != null)
470 {
471 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
472 }
473
474 dc_request.appendChild(dc_param_list);
475
476 // get the content
477 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
478 if (expand_document)
479 {
480 dc_request.appendChild(dm_doc_list);
481 }
482 else
483 {
484 dc_request.appendChild(basic_doc_list);
485 }
486 logger.debug("request = " + XMLConverter.getString(dc_message));
487 Element dc_response_message = (Element) this.mr.process(dc_message);
488 if (processErrorElements(dc_response_message, page_response))
489 {
490 return result;
491 }
492
493 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
494
495 if (expand_document)
496 {
497 // Merge the content with the structure information
498 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
499 for (int i = 0; i < doc_nodes.getLength(); i++)
500 {
501 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
502 if (content != null)
503 {
504 if (highlight_query_terms)
505 {
506 content = highlightQueryTerms(request, (Element) content);
507 }
508 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
509 }
510 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
511 }
512 }
513 else
514 {
515 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
516 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
517 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
518 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
519
520 if (dc_response_doc_content == null)
521 {
522 // no content to add
523 if (dc_response_doc.getAttribute("external").equals("true"))
524 {
525
526 //if (dc_response_doc_external != null)
527 //{
528 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
529
530 the_document.setAttribute("selectedNode", href_id);
531 the_document.setAttribute("external", href_id);
532 }
533 return result;
534 }
535 if (highlight_query_terms)
536 {
537 dc_response_doc.removeChild(dc_response_doc_content);
538
539 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
540 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
541 }
542
543 if (provide_annotations)
544 {
545 String service_selected = (String) params.get(ENRICH_DOC_ARG);
546 if (service_selected != null && service_selected.equals("1"))
547 {
548 // now we can modifiy the response doc if needed
549 String enrich_service = (String) params.get(GSParams.SERVICE);
550 // send a message to the service
551 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
552 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
553 enrich_message.appendChild(enrich_request);
554 // check for parameters
555 HashMap e_service_params = (HashMap) params.get("s1");
556 if (e_service_params != null)
557 {
558 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
559 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
560 enrich_request.appendChild(enrich_pl);
561 }
562 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
563 enrich_request.appendChild(e_doc_list);
564 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
565
566 Node enrich_response = this.mr.process(enrich_message);
567
568 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
569 path = GSPath.createPath(links);
570 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
571
572 }
573 } // if provide_annotations
574
575 // use the returned id rather than the sent one cos there may have
576 // been modifiers such as .pr that are removed.
577 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
578 the_document.setAttribute("selectedNode", modified_doc_id);
579 if (has_dummy)
580 {
581 // change the id if necessary and add the content
582 Element dummy_node = (Element) doc_nodes.item(0);
583
584 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
585 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
586 // hack for simple type
587 if (document_type.equals("simple"))
588 {
589 // we dont want the internal docNode, just want the content and metadata in the document
590 // rethink this!!
591 the_document.removeChild(dummy_node);
592
593 NodeList dummy_children = dummy_node.getChildNodes();
594 //for (int i=0; i<dummy_children.getLength(); i++) {
595 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
596 {
597 // special case as we don't want more than one metadata list
598 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
599 {
600 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
601 }
602 else
603 {
604 the_document.appendChild(dummy_children.item(i));
605 }
606 }
607 }
608 }
609 else
610 {
611 // Merge the document content with the metadata and structure information
612 for (int i = 0; i < doc_nodes.getLength(); i++)
613 {
614 Node dn = doc_nodes.item(i);
615 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
616 if (dn_id.equals(modified_doc_id))
617 {
618 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
619 break;
620 }
621 }
622 }
623 }
624 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
625 return result;
626 }
627
628 /**
629 * tell the param class what its arguments are if an action has its own
630 * arguments, this should add them to the params object - particularly
631 * important for args that should not be saved
632 */
633 public boolean addActionParameters(GSParams params)
634 {
635 params.addParameter(GOTO_PAGE_ARG, false);
636 params.addParameter(ENRICH_DOC_ARG, false);
637 params.addParameter(EXPAND_DOCUMENT_ARG, false);
638 params.addParameter(EXPAND_CONTENTS_ARG, false);
639 params.addParameter(REALISTIC_BOOK_ARG, false);
640
641 return true;
642 }
643
644 /**
645 * this method gets the collection description, the format info, the list of
646 * enrich services, etc - stuff that is needed for the page, but is the same
647 * whatever the query is - should be cached
648 */
649 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
650 {
651
652 // create a message to process - contains requests for the collection
653 // description, the format element, the enrich services on offer
654 // these could all be cached
655 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
656 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
657 // the format request - ignore for now, where does this request go to??
658 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
659 info_message.appendChild(format_request);
660
661 // the enrich_services request - only do this if provide_annotations is true
662
663 if (provide_annotations)
664 {
665 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
666 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
667 info_message.appendChild(enrich_services_request);
668 }
669
670 Element info_response = (Element) this.mr.process(info_message);
671
672 // the collection is the first response
673 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
674 Element format_resp = (Element) responses.item(0);
675
676 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
677 if (format_elem != null)
678 {
679 logger.debug("doc action found a format statement");
680 // set teh format type
681 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
682 page_response.appendChild(this.doc.importNode(format_elem, true));
683 }
684
685 if (provide_annotations)
686 {
687 Element services_resp = (Element) responses.item(1);
688
689 // a new message for the mr
690 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
691 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
692 boolean service_found = false;
693 for (int j = 0; j < e_services.getLength(); j++)
694 {
695 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
696 {
697 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
698 enrich_message.appendChild(s);
699 service_found = true;
700 }
701 }
702 if (service_found)
703 {
704 Element enrich_response = (Element) this.mr.process(enrich_message);
705
706 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
707 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
708 for (int i = 0; i < e_responses.getLength(); i++)
709 {
710 Element e_resp = (Element) e_responses.item(i);
711 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
712 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
713 service_list.appendChild(e_service);
714 }
715 page_response.appendChild(service_list);
716 }
717 } // if provide_annotations
718 return true;
719
720 }
721
722 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
723 {
724
725 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
726 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
727 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
728 ds_message.appendChild(ds_request);
729
730 // Create a parameter list to specify the required structure information
731 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
732 Element ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
733 ds_param_list.appendChild(ds_param);
734 ds_param.setAttribute(GSXML.NAME_ATT, "info");
735 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
736
737 ds_request.appendChild(ds_param_list);
738
739 // add the node list we created earlier
740 ds_request.appendChild(basic_doc_list);
741
742 logger.error("doctype request = "+this.converter.getPrettyString(ds_request));
743 // Process the document structure retrieve message
744 Element ds_response_message = (Element) this.mr.process(ds_message);
745 logger.error("doctype response = "+this.converter.getPrettyString(ds_response_message));
746 if (processErrorElements(ds_response_message, page_response))
747 {
748 return null;
749 }
750
751 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo"};
752 String path = GSPath.createPath(links);
753 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
754 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
755 if (doctype_elem != null) {
756 String doc_type = doctype_elem.getAttribute("value");
757 return doc_type;
758 }
759 return null;
760 }
761
762 /**
763 * this involves a bit of a hack to get the equivalent query terms - has to
764 * requery the query service - uses the last selected service name. (if it
765 * ends in query). should this action do the query or should it send a
766 * message to the query action? but that will involve lots of extra stuff.
767 * also doesn't handle phrases properly - just highlights all the terms
768 * found in the text.
769 */
770 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
771 {
772 // do the query again to get term info
773 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
774 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
775
776 HashMap previous_params = (HashMap) params.get("p");
777 if (previous_params == null)
778 {
779 return dc_response_doc_content;
780 }
781 String service_name = (String) previous_params.get(GSParams.SERVICE);
782 if (service_name == null || !service_name.endsWith("Query"))
783 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
784 logger.debug("invalid service, not doing highlighting");
785 return dc_response_doc_content;
786 }
787 String collection = (String) params.get(GSParams.COLLECTION);
788 UserContext userContext = new UserContext(request);
789 String to = GSPath.appendLink(collection, service_name);
790
791 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
792 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
793 mr_query_message.appendChild(mr_query_request);
794
795 // paramList
796 HashMap service_params = (HashMap) params.get("s1");
797
798 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
799 GSXML.addParametersToList(this.doc, query_param_list, service_params);
800 mr_query_request.appendChild(query_param_list);
801
802 // do the query
803 Element mr_query_response = (Element) this.mr.process(mr_query_message);
804
805 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
806 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
807 if (query_term_list_element == null)
808 {
809 // no term info
810 logger.error("No query term information.\n");
811 return dc_response_doc_content;
812 }
813
814 String content = GSXML.getNodeText(dc_response_doc_content);
815
816 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
817 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
818
819 HashSet<String> query_term_variants = new HashSet<String>();
820 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
821 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
822 {
823 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
824 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
825 {
826 for (int i = 0; i < terms_nodelist.getLength(); i++)
827 {
828 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
829 String termValueU = null;
830 String termValueL = null;
831
832 if (termValue.length() > 1)
833 {
834 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
835 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
836 }
837 else
838 {
839 termValueU = termValue.substring(0, 1).toUpperCase();
840 termValueL = termValue.substring(0, 1).toLowerCase();
841 }
842
843 query_term_variants.add(termValueU);
844 query_term_variants.add(termValueL);
845 }
846 }
847 }
848 else
849 {
850 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
851 {
852 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
853 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
854 for (int j = 0; j < equivalent_terms.length; j++)
855 {
856 query_term_variants.add(equivalent_terms[j]);
857 }
858 }
859 }
860
861 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
862
863 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
864 String performed_query = GSXML.getNodeText(query_element) + " ";
865
866 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
867 int term_start = 0;
868 boolean in_term = false;
869 boolean in_phrase = false;
870 for (int i = 0; i < performed_query.length(); i++)
871 {
872 char character = performed_query.charAt(i);
873 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
874
875 // Has a query term just started?
876 if (in_term == false && is_character_letter_or_digit == true)
877 {
878 in_term = true;
879 term_start = i;
880 }
881
882 // Or has a term just finished?
883 else if (in_term == true && is_character_letter_or_digit == false)
884 {
885 in_term = false;
886 String term = performed_query.substring(term_start, i);
887
888 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
889 if (term_element != null)
890 {
891
892 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
893
894 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
895 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
896 {
897 String termValueU = null;
898 String termValueL = null;
899
900 if (term.length() > 1)
901 {
902 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
903 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
904 }
905 else
906 {
907 termValueU = term.substring(0, 1).toUpperCase();
908 termValueL = term.substring(0, 1).toLowerCase();
909 }
910
911 phrase_query_p_term_x_variants.add(termValueU);
912 phrase_query_p_term_x_variants.add(termValueL);
913 }
914 else
915 {
916 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
917 {
918 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
919 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
920 for (int k = 0; k < term_equivalent_terms.length; k++)
921 {
922 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
923 }
924 }
925 }
926 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
927
928 if (in_phrase == false)
929 {
930 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
931 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
932 }
933 }
934 }
935 // Watch for phrases (surrounded by quotes)
936 if (character == '\"')
937 {
938 // Has a phrase just started?
939 if (in_phrase == false)
940 {
941 in_phrase = true;
942 }
943 // Or has a phrase just finished?
944 else if (in_phrase == true)
945 {
946 in_phrase = false;
947 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
948 }
949
950 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
951 }
952 }
953
954 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
955 }
956
957 /**
958 * Highlights query terms in a piece of text.
959 */
960 private Element highlightQueryTermsInternal(String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
961 {
962 // Convert the content string to an array of characters for speed
963 char[] content_characters = new char[content.length()];
964 content.getChars(0, content.length(), content_characters, 0);
965
966 // Now skim through the content, identifying word matches
967 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
968 int word_start = 0;
969 boolean in_word = false;
970 boolean preceding_word_matched = false;
971 boolean inTag = false;
972 for (int i = 0; i < content_characters.length; i++)
973 {
974 //We don't want to find words inside HTML tags
975 if (content_characters[i] == '<')
976 {
977 inTag = true;
978 continue;
979 }
980 else if (inTag && content_characters[i] == '>')
981 {
982 inTag = false;
983 }
984 else if (inTag)
985 {
986 continue;
987 }
988
989 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
990
991 // Has a word just started?
992 if (in_word == false && is_character_letter_or_digit == true)
993 {
994 in_word = true;
995 word_start = i;
996 }
997
998 // Or has a word just finished?
999 else if (in_word == true && is_character_letter_or_digit == false)
1000 {
1001 in_word = false;
1002
1003 // Check if the word matches any of the query term equivalents
1004 String word = new String(content_characters, word_start, (i - word_start));
1005 if (query_term_variants.contains(word))
1006 {
1007 // We have found a matching word, so remember its location
1008 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1009 preceding_word_matched = true;
1010 }
1011 else
1012 {
1013 preceding_word_matched = false;
1014 }
1015 }
1016 }
1017
1018 // Don't forget the last word...
1019 if (in_word == true)
1020 {
1021 // Check if the word matches any of the query term equivalents
1022 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1023 if (query_term_variants.contains(word))
1024 {
1025 // We have found a matching word, so remember its location
1026 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1027 }
1028 }
1029
1030 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1031 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1032
1033 // Deal with phrases now
1034 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1035 for (int i = 0; i < word_matches.size(); i++)
1036 {
1037 WordMatch word_match = word_matches.get(i);
1038
1039 // See if any partial phrase matches are extended by this word
1040 if (word_match.preceding_word_matched)
1041 {
1042 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1043 {
1044 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1045 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1046 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1047 if (phrase_query_p_term_x_variants.contains(word_match.word))
1048 {
1049 partial_phrase_match.num_words_matched++;
1050
1051 // Has a complete phrase match occurred?
1052 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1053 {
1054 // Check for overlaps by looking at the previous highlight range
1055 if (!highlight_end_positions.isEmpty())
1056 {
1057 int last_highlight_index = highlight_end_positions.size() - 1;
1058 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1059 if (last_highlight_end > partial_phrase_match.start_position)
1060 {
1061 // There is an overlap, so remove the previous phrase match
1062 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1063 highlight_end_positions.remove(last_highlight_index);
1064 partial_phrase_match.start_position = last_highlight_start;
1065 }
1066 }
1067
1068 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1069 highlight_end_positions.add(new Integer(word_match.end_position));
1070 }
1071 // No, but add the partial match back into the list for next time
1072 else
1073 {
1074 partial_phrase_matches.add(partial_phrase_match);
1075 }
1076 }
1077 }
1078 }
1079 else
1080 {
1081 partial_phrase_matches.clear();
1082 }
1083
1084 // See if this word is at the start of any of the phrases
1085 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1086 {
1087 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1088 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1089 if (phrase_query_p_term_1_variants.contains(word_match.word))
1090 {
1091 // If this phrase is just one word long, we have a complete match
1092 if (phrase_query_p_term_variants_list.size() == 1)
1093 {
1094 highlight_start_positions.add(new Integer(word_match.start_position));
1095 highlight_end_positions.add(new Integer(word_match.end_position));
1096 }
1097 // Otherwise we have the start of a potential phrase match
1098 else
1099 {
1100 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1101 }
1102 }
1103 }
1104 }
1105
1106 // Now add the annotation tags into the document at the correct points
1107 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1108
1109 int last_wrote = 0;
1110 for (int i = 0; i < highlight_start_positions.size(); i++)
1111 {
1112 int highlight_start = highlight_start_positions.get(i).intValue();
1113 int highlight_end = highlight_end_positions.get(i).intValue();
1114
1115 // Print anything before the highlight range
1116 if (last_wrote < highlight_start)
1117 {
1118 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1119 content_element.appendChild(this.doc.createTextNode(preceding_text));
1120 }
1121
1122 // Print the highlight text, annotated
1123 if (highlight_end > last_wrote)
1124 {
1125 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1126 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1127 annotation_element.setAttribute("type", "query_term");
1128 content_element.appendChild(annotation_element);
1129 last_wrote = highlight_end;
1130 }
1131 }
1132
1133 // Finish off any unwritten text
1134 if (last_wrote < content_characters.length)
1135 {
1136 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1137 content_element.appendChild(this.doc.createTextNode(remaining_text));
1138 }
1139
1140 return content_element;
1141 }
1142
1143 static private class WordMatch
1144 {
1145 public String word;
1146 public int start_position;
1147 public int end_position;
1148 public boolean preceding_word_matched;
1149
1150 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1151 {
1152 this.word = word;
1153 this.start_position = start_position;
1154 this.end_position = end_position;
1155 this.preceding_word_matched = preceding_word_matched;
1156 }
1157 }
1158
1159 static private class PartialPhraseMatch
1160 {
1161 public int start_position;
1162 public int query_phrase_number;
1163 public int num_words_matched;
1164
1165 public PartialPhraseMatch(int start_position, int query_phrase_number)
1166 {
1167 this.start_position = start_position;
1168 this.query_phrase_number = query_phrase_number;
1169 this.num_words_matched = 1;
1170 }
1171 }
1172}
Note: See TracBrowser for help on using the repository browser.