source: main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java@ 26026

Last change on this file since 26026 was 26026, checked in by sjm84, 12 years ago

Analyse xsl files in advance to find out what metadata we need

  • Property svn:keywords set to Author Date Id Revision
File size: 42.8 KB
Line 
1/*
2 * DocumentAction.java
3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 */
19package org.greenstone.gsdl3.action;
20
21// Greenstone classes
22import org.greenstone.gsdl3.core.ModuleInterface;
23import org.greenstone.gsdl3.util.*;
24
25// XML classes
26import org.w3c.dom.Document;
27import org.w3c.dom.Element;
28import org.w3c.dom.Node;
29import org.w3c.dom.Text;
30import org.w3c.dom.NodeList;
31
32// General Java classes
33import java.util.ArrayList;
34import java.util.HashMap;
35import java.util.HashSet;
36import java.io.File;
37import java.io.Serializable;
38
39import org.apache.log4j.*;
40
41/** Action class for retrieving Documents via the message router */
42public class DocumentAction extends Action
43{
44
45 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
46
47 // this is used to specify that the sibling nodes of a selected one should be obtained
48 public static final String SIBLING_ARG = "sib";
49 public static final String GOTO_PAGE_ARG = "gp";
50 public static final String ENRICH_DOC_ARG = "end";
51 public static final String EXPAND_DOCUMENT_ARG = "ed";
52 public static final String EXPAND_CONTENTS_ARG = "ec";
53 public static final String REALISTIC_BOOK_ARG = "book";
54
55 /**
56 * if this is set to true, when a document is displayed, any annotation type
57 * services (enrich) will be offered to the user as well
58 */
59 protected boolean provide_annotations = false;
60
61 protected boolean highlight_query_terms = false;
62
63 public boolean configure()
64 {
65 super.configure();
66 String highlight = (String) config_params.get("highlightQueryTerms");
67 if (highlight != null && highlight.equals("true"))
68 {
69 highlight_query_terms = true;
70 }
71 String annotate = (String) config_params.get("displayAnnotationService");
72 if (annotate != null && annotate.equals("true"))
73 {
74 provide_annotations = true;
75 }
76 return true;
77 }
78
79 public Node process(Node message_node)
80 {
81 // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
82
83 Element message = this.converter.nodeToElement(message_node);
84
85 // the response
86 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM);
87 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM);
88 result.appendChild(page_response);
89
90 // get the request - assume only one
91 Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
92 Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
93 HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
94
95 // just in case there are some that need to get passed to the services
96 HashMap service_params = (HashMap) params.get("s0");
97
98 String collection = (String) params.get(GSParams.COLLECTION);
99 String document_id = (String) params.get(GSParams.DOCUMENT);
100 if (document_id != null && document_id.equals(""))
101 {
102 document_id = null;
103 }
104 String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
105 if (href != null && href.equals(""))
106 {
107 href = null;
108 }
109 String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
110 if (document_id == null && href == null)
111 {
112 logger.error("no document specified!");
113 return result;
114 }
115 if (rl != null && rl.equals("0"))
116 {
117 // this is a true external link, we should have been directed to a different page or action
118 logger.error("rl value was 0, shouldn't get here");
119 return result;
120 }
121 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
122 if (document_type != null && document_type.equals(""))
123 {
124 //document_type = "hierarchy";
125 document_type = null; // we'll get it later if not already specified
126 }
127 //whether to retrieve siblings or not
128 boolean get_siblings = false;
129 String sibs = (String) params.get(SIBLING_ARG);
130 if (sibs != null && sibs.equals("1"))
131 {
132 get_siblings = true;
133 }
134
135 String doc_id_modifier = "";
136 String sibling_num = (String) params.get(GOTO_PAGE_ARG);
137 if (sibling_num != null && !sibling_num.equals(""))
138 {
139 // we have to modify the doc name
140 doc_id_modifier = "." + sibling_num + ".ss";
141 }
142
143 boolean expand_document = false;
144 String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
145 if (ed_arg != null && ed_arg.equals("1"))
146 {
147 expand_document = true;
148 }
149
150 boolean expand_contents = false;
151 if (expand_document)
152 { // we always expand the contents with the text
153 expand_contents = true;
154 }
155 else
156 {
157 String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
158 if (ec_arg != null && ec_arg.equals("1"))
159 {
160 expand_contents = true;
161 }
162 }
163
164 UserContext userContext = new UserContext(request);
165
166 //append site metadata
167 addSiteMetadata(page_response, userContext);
168 addInterfaceOptions(page_response);
169
170 // get the additional data needed for the page
171 getBackgroundData(page_response, collection, userContext);
172 Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
173
174 // the_document is where all the doc info - structure and metadata etc
175 // is added into, to be returned in the page
176 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM);
177 page_response.appendChild(the_document);
178
179 // create a basic doc list containing the current node
180 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
181 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM);
182 basic_doc_list.appendChild(current_doc);
183 if (document_id != null)
184 {
185 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
186 }
187 else
188 {
189 current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
190 // do we need this??
191 current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
192 }
193
194 if (document_type == null)
195 {
196 logger.error("getting document type");
197 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
198 logger.error("new doc type = " + document_type);
199 }
200 if (document_type != null)
201 {
202 // set the doctype from the cgi arg or from the server as an attribute
203 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
204 }
205 else
206 {
207 logger.error("doctype is null!!!***********");
208 }
209
210 // Create a parameter list to specify the required structure information
211 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
212
213 if (service_params != null)
214 {
215 GSXML.addParametersToList(this.doc, ds_param_list, service_params);
216 }
217
218 Element ds_param = null;
219 boolean get_structure = false;
220 boolean get_structure_info = false;
221 if (document_type.equals(GSXML.DOC_TYPE_PAGED))
222 {
223 get_structure_info = true;
224
225 if (expand_contents)
226 {
227 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
228 ds_param_list.appendChild(ds_param);
229 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
230 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
231 }
232
233 // get the info needed for paged naviagtion
234 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
235 ds_param_list.appendChild(ds_param);
236 ds_param.setAttribute(GSXML.NAME_ATT, "info");
237 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
238 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
239 ds_param_list.appendChild(ds_param);
240 ds_param.setAttribute(GSXML.NAME_ATT, "info");
241 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
242 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
243 ds_param_list.appendChild(ds_param);
244 ds_param.setAttribute(GSXML.NAME_ATT, "info");
245 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
246
247 if (get_siblings)
248 {
249 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
250 ds_param_list.appendChild(ds_param);
251 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
252 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
253 }
254
255 }
256 else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) ||document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY) )
257 {
258 get_structure = true;
259 if (expand_contents)
260 {
261 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
262 ds_param_list.appendChild(ds_param);
263 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
264 ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
265 }
266 else
267 {
268 // get the info needed for table of contents
269 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
270 ds_param_list.appendChild(ds_param);
271 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
272 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
273 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
274 ds_param_list.appendChild(ds_param);
275 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
276 ds_param.setAttribute(GSXML.VALUE_ATT, "children");
277 if (get_siblings)
278 {
279 ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
280 ds_param_list.appendChild(ds_param);
281 ds_param.setAttribute(GSXML.NAME_ATT, "structure");
282 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
283 }
284 }
285 }
286 else
287 {
288 // we dont need any structure
289 }
290
291 boolean has_dummy = false;
292 if (get_structure || get_structure_info)
293 {
294
295 // Build a request to obtain the document structure
296 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
297 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
298 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
299 ds_message.appendChild(ds_request);
300 ds_request.appendChild(ds_param_list);
301
302 // add the node list we created earlier
303 ds_request.appendChild(basic_doc_list);
304
305 // Process the document structure retrieve message
306 Element ds_response_message = (Element) this.mr.process(ds_message);
307 if (processErrorElements(ds_response_message, page_response))
308 {
309 return result;
310 }
311
312 // get the info and print out
313 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
314 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
315 path = GSPath.appendLink(path, "nodeStructureInfo");
316 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
317 // get the doc_node bit
318 if (ds_response_struct_info != null)
319 {
320 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));
321 }
322 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
323 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
324 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
325 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
326
327 if (ds_response_structure != null)
328 {
329 // add the contents of the structure bit into the_document
330 NodeList structs = ds_response_structure.getChildNodes();
331 for (int i = 0; i < structs.getLength(); i++)
332 {
333 the_document.appendChild(this.doc.importNode(structs.item(i), true));
334 }
335 }
336 else
337 {
338 // no structure nodes, so put in a dummy doc node
339 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
340 if (document_id != null)
341 {
342 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
343 }
344 else
345 {
346 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
347
348 }
349 the_document.appendChild(doc_node);
350 has_dummy = true;
351 }
352 }
353 else
354 { // a simple type - we dont have a dummy node for simple
355 // should think about this more
356 // no structure request, so just put in a dummy doc node
357 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
358 if (document_id != null)
359 {
360 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
361 }
362 else
363 {
364 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
365 }
366 the_document.appendChild(doc_node);
367 has_dummy = true;
368 }
369
370 // Build a request to obtain some document metadata
371 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
372 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?
373 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
374 dm_message.appendChild(dm_request);
375 // Create a parameter list to specify the required metadata information
376
377 HashSet<String> meta_names = new HashSet<String>();
378 meta_names.add("Title"); // the default
379 if (format_elem != null)
380 {
381 getRequiredMetadataNames(format_elem, meta_names);
382 }
383
384 Element extraMetaListElem = (Element) GSXML.getChildByTagName(request, GSXML.EXTRA_METADATA + GSXML.LIST_MODIFIER);
385 if(extraMetaListElem != null)
386 {
387 NodeList extraMetaList = extraMetaListElem.getElementsByTagName(GSXML.EXTRA_METADATA);
388 for(int i = 0; i < extraMetaList.getLength(); i++)
389 {
390 meta_names.add(((Element)extraMetaList.item(i)).getAttribute(GSXML.NAME_ATT));
391 }
392 }
393
394 Element dm_param_list = createMetadataParamList(meta_names);
395 if (service_params != null)
396 {
397 GSXML.addParametersToList(this.doc, dm_param_list, service_params);
398 }
399
400 dm_request.appendChild(dm_param_list);
401
402 // create the doc node list for the metadata request
403 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
404 dm_request.appendChild(dm_doc_list);
405
406 // Add each node from the structure response into the metadata request
407 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
408 for (int i = 0; i < doc_nodes.getLength(); i++)
409 {
410 Element doc_node = (Element) doc_nodes.item(i);
411 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
412
413 // Add the documentNode to the list
414 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
415 dm_doc_list.appendChild(dm_doc_node);
416 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
417 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
418 }
419
420 // we also want a metadata request to the top level document to get
421 // assocfilepath - this could be cached too
422 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
423 dm_message.appendChild(doc_meta_request);
424 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
425 if (service_params != null)
426 {
427 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);
428 }
429
430 doc_meta_request.appendChild(doc_meta_param_list);
431 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);
432 doc_meta_param_list.appendChild(doc_param);
433 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
434 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
435
436 // create the doc node list for the metadata request
437 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
438 doc_meta_request.appendChild(doc_list);
439
440 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);
441 // the node we want is the root document node
442 if (document_id != null)
443 {
444 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
445 }
446 else
447 {
448 doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
449 // can we assume that href is always a top level doc??
450 //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
451 //doc_node.setAttribute("externalURL", has_rl);
452 }
453 doc_list.appendChild(doc_node);
454
455 Element dm_response_message = (Element) this.mr.process(dm_message);
456 if (processErrorElements(dm_response_message, page_response))
457 {
458 return result;
459 }
460
461 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
462 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
463
464 // Merge the metadata with the structure information
465 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
466 for (int i = 0; i < doc_nodes.getLength(); i++)
467 {
468 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
469 }
470 // get the top level doc metadata out
471 Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
472 Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
473 GSXML.mergeMetadataLists(the_document, top_doc_node);
474
475 // Build a request to obtain some document content
476 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
477 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?
478 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
479 dc_message.appendChild(dc_request);
480
481 // Create a parameter list to specify the request parameters - empty for now
482 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
483 if (service_params != null)
484 {
485 GSXML.addParametersToList(this.doc, dc_param_list, service_params);
486 }
487
488 dc_request.appendChild(dc_param_list);
489
490 // get the content
491 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
492 if (expand_document)
493 {
494 dc_request.appendChild(dm_doc_list);
495 }
496 else
497 {
498 dc_request.appendChild(basic_doc_list);
499 }
500 logger.debug("request = " + XMLConverter.getString(dc_message));
501 Element dc_response_message = (Element) this.mr.process(dc_message);
502 if (processErrorElements(dc_response_message, page_response))
503 {
504 return result;
505 }
506
507 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
508
509 if (expand_document)
510 {
511 // Merge the content with the structure information
512 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
513 for (int i = 0; i < doc_nodes.getLength(); i++)
514 {
515 Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), "nodeContent");
516 if (content != null)
517 {
518 if (highlight_query_terms)
519 {
520 content = highlightQueryTerms(request, (Element) content);
521 }
522 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));
523 }
524 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
525 }
526 }
527 else
528 {
529 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
530 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
531 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
532 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
533
534 if (dc_response_doc_content == null)
535 {
536 // no content to add
537 if (dc_response_doc.getAttribute("external").equals("true"))
538 {
539
540 //if (dc_response_doc_external != null)
541 //{
542 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
543
544 the_document.setAttribute("selectedNode", href_id);
545 the_document.setAttribute("external", href_id);
546 }
547 return result;
548 }
549 if (highlight_query_terms)
550 {
551 dc_response_doc.removeChild(dc_response_doc_content);
552
553 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);
554 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
555 }
556
557 if (provide_annotations)
558 {
559 String service_selected = (String) params.get(ENRICH_DOC_ARG);
560 if (service_selected != null && service_selected.equals("1"))
561 {
562 // now we can modifiy the response doc if needed
563 String enrich_service = (String) params.get(GSParams.SERVICE);
564 // send a message to the service
565 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
566 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
567 enrich_message.appendChild(enrich_request);
568 // check for parameters
569 HashMap e_service_params = (HashMap) params.get("s1");
570 if (e_service_params != null)
571 {
572 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
573 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);
574 enrich_request.appendChild(enrich_pl);
575 }
576 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
577 enrich_request.appendChild(e_doc_list);
578 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));
579
580 Node enrich_response = this.mr.process(enrich_message);
581
582 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
583 path = GSPath.createPath(links);
584 dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
585
586 }
587 } // if provide_annotations
588
589 // use the returned id rather than the sent one cos there may have
590 // been modifiers such as .pr that are removed.
591 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
592 the_document.setAttribute("selectedNode", modified_doc_id);
593 if (has_dummy)
594 {
595 // change the id if necessary and add the content
596 Element dummy_node = (Element) doc_nodes.item(0);
597
598 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
599 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));
600 // hack for simple type
601 if (document_type.equals("simple"))
602 {
603 // we dont want the internal docNode, just want the content and metadata in the document
604 // rethink this!!
605 the_document.removeChild(dummy_node);
606
607 NodeList dummy_children = dummy_node.getChildNodes();
608 //for (int i=0; i<dummy_children.getLength(); i++) {
609 for (int i = dummy_children.getLength() - 1; i >= 0; i--)
610 {
611 // special case as we don't want more than one metadata list
612 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
613 {
614 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
615 }
616 else
617 {
618 the_document.appendChild(dummy_children.item(i));
619 }
620 }
621 }
622 }
623 else
624 {
625 // Merge the document content with the metadata and structure information
626 for (int i = 0; i < doc_nodes.getLength(); i++)
627 {
628 Node dn = doc_nodes.item(i);
629 String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
630 if (dn_id.equals(modified_doc_id))
631 {
632 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));
633 break;
634 }
635 }
636 }
637 }
638 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));
639 return result;
640 }
641
642 /**
643 * tell the param class what its arguments are if an action has its own
644 * arguments, this should add them to the params object - particularly
645 * important for args that should not be saved
646 */
647 public boolean addActionParameters(GSParams params)
648 {
649 params.addParameter(GOTO_PAGE_ARG, false);
650 params.addParameter(ENRICH_DOC_ARG, false);
651 params.addParameter(EXPAND_DOCUMENT_ARG, false);
652 params.addParameter(EXPAND_CONTENTS_ARG, false);
653 params.addParameter(REALISTIC_BOOK_ARG, false);
654
655 return true;
656 }
657
658 /**
659 * this method gets the collection description, the format info, the list of
660 * enrich services, etc - stuff that is needed for the page, but is the same
661 * whatever the query is - should be cached
662 */
663 protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
664 {
665
666 // create a message to process - contains requests for the collection
667 // description, the format element, the enrich services on offer
668 // these could all be cached
669 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
670 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");
671 // the format request - ignore for now, where does this request go to??
672 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
673 info_message.appendChild(format_request);
674
675 // the enrich_services request - only do this if provide_annotations is true
676
677 if (provide_annotations)
678 {
679 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
680 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
681 info_message.appendChild(enrich_services_request);
682 }
683
684 Element info_response = (Element) this.mr.process(info_message);
685
686 // the collection is the first response
687 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
688 Element format_resp = (Element) responses.item(0);
689
690 Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
691 if (format_elem != null)
692 {
693 Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
694 if(global_format_elem != null)
695 {
696 GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
697 }
698
699 // set the format type
700 format_elem.setAttribute(GSXML.TYPE_ATT, "display");
701 page_response.appendChild(this.doc.importNode(format_elem, true));
702 }
703
704 if (provide_annotations)
705 {
706 Element services_resp = (Element) responses.item(1);
707
708 // a new message for the mr
709 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
710 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
711 boolean service_found = false;
712 for (int j = 0; j < e_services.getLength(); j++)
713 {
714 if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
715 {
716 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
717 enrich_message.appendChild(s);
718 service_found = true;
719 }
720 }
721 if (service_found)
722 {
723 Element enrich_response = (Element) this.mr.process(enrich_message);
724
725 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
726 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
727 for (int i = 0; i < e_responses.getLength(); i++)
728 {
729 Element e_resp = (Element) e_responses.item(i);
730 Element e_service = (Element) this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
731 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
732 service_list.appendChild(e_service);
733 }
734 page_response.appendChild(service_list);
735 }
736 } // if provide_annotations
737 return true;
738
739 }
740
741 protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
742 {
743 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
744 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?
745 Element ds_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
746 ds_message.appendChild(ds_request);
747
748 // Create a parameter list to specify the required structure information
749 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
750 Element ds_param = this.doc.createElement(GSXML.PARAM_ELEM);
751 ds_param_list.appendChild(ds_param);
752 ds_param.setAttribute(GSXML.NAME_ATT, "info");
753 ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
754
755 ds_request.appendChild(ds_param_list);
756
757 // add the node list we created earlier
758 ds_request.appendChild(basic_doc_list);
759
760 // Process the document structure retrieve message
761 Element ds_response_message = (Element) this.mr.process(ds_message);
762 if (processErrorElements(ds_response_message, page_response))
763 {
764 return null;
765 }
766
767 String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
768 String path = GSPath.createPath(links);
769 Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
770 Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
771 if (doctype_elem != null)
772 {
773 String doc_type = doctype_elem.getAttribute("value");
774 return doc_type;
775 }
776 return null;
777 }
778
779 /**
780 * this involves a bit of a hack to get the equivalent query terms - has to
781 * requery the query service - uses the last selected service name. (if it
782 * ends in query). should this action do the query or should it send a
783 * message to the query action? but that will involve lots of extra stuff.
784 * also doesn't handle phrases properly - just highlights all the terms
785 * found in the text.
786 */
787 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content)
788 {
789 // do the query again to get term info
790 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
791 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
792
793 HashMap previous_params = (HashMap) params.get("p");
794 if (previous_params == null)
795 {
796 return dc_response_doc_content;
797 }
798 String service_name = (String) previous_params.get(GSParams.SERVICE);
799 if (service_name == null || !service_name.endsWith("Query"))
800 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
801 logger.debug("invalid service, not doing highlighting");
802 return dc_response_doc_content;
803 }
804 String collection = (String) params.get(GSParams.COLLECTION);
805 UserContext userContext = new UserContext(request);
806 String to = GSPath.appendLink(collection, service_name);
807
808 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);
809 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
810 mr_query_message.appendChild(mr_query_request);
811
812 // paramList
813 HashMap service_params = (HashMap) params.get("s1");
814
815 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
816 GSXML.addParametersToList(this.doc, query_param_list, service_params);
817 mr_query_request.appendChild(query_param_list);
818
819 // do the query
820 Element mr_query_response = (Element) this.mr.process(mr_query_message);
821
822 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
823 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
824 if (query_term_list_element == null)
825 {
826 // no term info
827 logger.error("No query term information.\n");
828 return dc_response_doc_content;
829 }
830
831 String content = GSXML.getNodeText(dc_response_doc_content);
832
833 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
834 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
835
836 HashSet<String> query_term_variants = new HashSet<String>();
837 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
838 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
839 {
840 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
841 if (terms_nodelist != null && terms_nodelist.getLength() > 0)
842 {
843 for (int i = 0; i < terms_nodelist.getLength(); i++)
844 {
845 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
846 String termValueU = null;
847 String termValueL = null;
848
849 if (termValue.length() > 1)
850 {
851 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
852 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
853 }
854 else
855 {
856 termValueU = termValue.substring(0, 1).toUpperCase();
857 termValueL = termValue.substring(0, 1).toLowerCase();
858 }
859
860 query_term_variants.add(termValueU);
861 query_term_variants.add(termValueL);
862 }
863 }
864 }
865 else
866 {
867 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
868 {
869 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
870 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
871 for (int j = 0; j < equivalent_terms.length; j++)
872 {
873 query_term_variants.add(equivalent_terms[j]);
874 }
875 }
876 }
877
878 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
879
880 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
881 String performed_query = GSXML.getNodeText(query_element) + " ";
882
883 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
884 int term_start = 0;
885 boolean in_term = false;
886 boolean in_phrase = false;
887 for (int i = 0; i < performed_query.length(); i++)
888 {
889 char character = performed_query.charAt(i);
890 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
891
892 // Has a query term just started?
893 if (in_term == false && is_character_letter_or_digit == true)
894 {
895 in_term = true;
896 term_start = i;
897 }
898
899 // Or has a term just finished?
900 else if (in_term == true && is_character_letter_or_digit == false)
901 {
902 in_term = false;
903 String term = performed_query.substring(term_start, i);
904
905 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
906 if (term_element != null)
907 {
908
909 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
910
911 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
912 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
913 {
914 String termValueU = null;
915 String termValueL = null;
916
917 if (term.length() > 1)
918 {
919 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
920 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
921 }
922 else
923 {
924 termValueU = term.substring(0, 1).toUpperCase();
925 termValueL = term.substring(0, 1).toLowerCase();
926 }
927
928 phrase_query_p_term_x_variants.add(termValueU);
929 phrase_query_p_term_x_variants.add(termValueL);
930 }
931 else
932 {
933 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
934 {
935 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
936 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
937 for (int k = 0; k < term_equivalent_terms.length; k++)
938 {
939 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
940 }
941 }
942 }
943 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
944
945 if (in_phrase == false)
946 {
947 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
948 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
949 }
950 }
951 }
952 // Watch for phrases (surrounded by quotes)
953 if (character == '\"')
954 {
955 // Has a phrase just started?
956 if (in_phrase == false)
957 {
958 in_phrase = true;
959 }
960 // Or has a phrase just finished?
961 else if (in_phrase == true)
962 {
963 in_phrase = false;
964 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
965 }
966
967 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
968 }
969 }
970
971 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
972 }
973
974 /**
975 * Highlights query terms in a piece of text.
976 */
977 private Element highlightQueryTermsInternal(String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
978 {
979 // Convert the content string to an array of characters for speed
980 char[] content_characters = new char[content.length()];
981 content.getChars(0, content.length(), content_characters, 0);
982
983 // Now skim through the content, identifying word matches
984 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
985 int word_start = 0;
986 boolean in_word = false;
987 boolean preceding_word_matched = false;
988 boolean inTag = false;
989 for (int i = 0; i < content_characters.length; i++)
990 {
991 //We don't want to find words inside HTML tags
992 if (content_characters[i] == '<')
993 {
994 inTag = true;
995 continue;
996 }
997 else if (inTag && content_characters[i] == '>')
998 {
999 inTag = false;
1000 }
1001 else if (inTag)
1002 {
1003 continue;
1004 }
1005
1006 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
1007
1008 // Has a word just started?
1009 if (in_word == false && is_character_letter_or_digit == true)
1010 {
1011 in_word = true;
1012 word_start = i;
1013 }
1014
1015 // Or has a word just finished?
1016 else if (in_word == true && is_character_letter_or_digit == false)
1017 {
1018 in_word = false;
1019
1020 // Check if the word matches any of the query term equivalents
1021 String word = new String(content_characters, word_start, (i - word_start));
1022 if (query_term_variants.contains(word))
1023 {
1024 // We have found a matching word, so remember its location
1025 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
1026 preceding_word_matched = true;
1027 }
1028 else
1029 {
1030 preceding_word_matched = false;
1031 }
1032 }
1033 }
1034
1035 // Don't forget the last word...
1036 if (in_word == true)
1037 {
1038 // Check if the word matches any of the query term equivalents
1039 String word = new String(content_characters, word_start, (content_characters.length - word_start));
1040 if (query_term_variants.contains(word))
1041 {
1042 // We have found a matching word, so remember its location
1043 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
1044 }
1045 }
1046
1047 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
1048 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
1049
1050 // Deal with phrases now
1051 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
1052 for (int i = 0; i < word_matches.size(); i++)
1053 {
1054 WordMatch word_match = word_matches.get(i);
1055
1056 // See if any partial phrase matches are extended by this word
1057 if (word_match.preceding_word_matched)
1058 {
1059 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
1060 {
1061 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
1062 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
1063 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
1064 if (phrase_query_p_term_x_variants.contains(word_match.word))
1065 {
1066 partial_phrase_match.num_words_matched++;
1067
1068 // Has a complete phrase match occurred?
1069 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
1070 {
1071 // Check for overlaps by looking at the previous highlight range
1072 if (!highlight_end_positions.isEmpty())
1073 {
1074 int last_highlight_index = highlight_end_positions.size() - 1;
1075 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
1076 if (last_highlight_end > partial_phrase_match.start_position)
1077 {
1078 // There is an overlap, so remove the previous phrase match
1079 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
1080 highlight_end_positions.remove(last_highlight_index);
1081 partial_phrase_match.start_position = last_highlight_start;
1082 }
1083 }
1084
1085 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
1086 highlight_end_positions.add(new Integer(word_match.end_position));
1087 }
1088 // No, but add the partial match back into the list for next time
1089 else
1090 {
1091 partial_phrase_matches.add(partial_phrase_match);
1092 }
1093 }
1094 }
1095 }
1096 else
1097 {
1098 partial_phrase_matches.clear();
1099 }
1100
1101 // See if this word is at the start of any of the phrases
1102 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
1103 {
1104 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
1105 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
1106 if (phrase_query_p_term_1_variants.contains(word_match.word))
1107 {
1108 // If this phrase is just one word long, we have a complete match
1109 if (phrase_query_p_term_variants_list.size() == 1)
1110 {
1111 highlight_start_positions.add(new Integer(word_match.start_position));
1112 highlight_end_positions.add(new Integer(word_match.end_position));
1113 }
1114 // Otherwise we have the start of a potential phrase match
1115 else
1116 {
1117 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
1118 }
1119 }
1120 }
1121 }
1122
1123 // Now add the annotation tags into the document at the correct points
1124 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
1125
1126 int last_wrote = 0;
1127 for (int i = 0; i < highlight_start_positions.size(); i++)
1128 {
1129 int highlight_start = highlight_start_positions.get(i).intValue();
1130 int highlight_end = highlight_end_positions.get(i).intValue();
1131
1132 // Print anything before the highlight range
1133 if (last_wrote < highlight_start)
1134 {
1135 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
1136 content_element.appendChild(this.doc.createTextNode(preceding_text));
1137 }
1138
1139 // Print the highlight text, annotated
1140 if (highlight_end > last_wrote)
1141 {
1142 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
1143 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
1144 annotation_element.setAttribute("type", "query_term");
1145 content_element.appendChild(annotation_element);
1146 last_wrote = highlight_end;
1147 }
1148 }
1149
1150 // Finish off any unwritten text
1151 if (last_wrote < content_characters.length)
1152 {
1153 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
1154 content_element.appendChild(this.doc.createTextNode(remaining_text));
1155 }
1156
1157 return content_element;
1158 }
1159
1160 static private class WordMatch
1161 {
1162 public String word;
1163 public int start_position;
1164 public int end_position;
1165 public boolean preceding_word_matched;
1166
1167 public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
1168 {
1169 this.word = word;
1170 this.start_position = start_position;
1171 this.end_position = end_position;
1172 this.preceding_word_matched = preceding_word_matched;
1173 }
1174 }
1175
1176 static private class PartialPhraseMatch
1177 {
1178 public int start_position;
1179 public int query_phrase_number;
1180 public int num_words_matched;
1181
1182 public PartialPhraseMatch(int start_position, int query_phrase_number)
1183 {
1184 this.start_position = start_position;
1185 this.query_phrase_number = query_phrase_number;
1186 this.num_words_matched = 1;
1187 }
1188 }
1189}
Note: See TracBrowser for help on using the repository browser.