1 | /*
|
---|
2 | * DocumentAction.java
|
---|
3 | * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org
|
---|
4 | *
|
---|
5 | * This program is free software; you can redistribute it and/or modify
|
---|
6 | * it under the terms of the GNU General Public License as published by
|
---|
7 | * the Free Software Foundation; either version 2 of the License, or
|
---|
8 | * (at your option) any later version.
|
---|
9 | *
|
---|
10 | * This program is distributed in the hope that it will be useful,
|
---|
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
13 | * GNU General Public License for more details.
|
---|
14 | *
|
---|
15 | * You should have received a copy of the GNU General Public License
|
---|
16 | * along with this program; if not, write to the Free Software
|
---|
17 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
18 | */
|
---|
19 | package org.greenstone.gsdl3.action;
|
---|
20 |
|
---|
21 | // Greenstone classes
|
---|
22 | import org.greenstone.gsdl3.core.ModuleInterface;
|
---|
23 | import org.greenstone.gsdl3.service.AbstractDocumentRetrieve;
|
---|
24 | import org.greenstone.gsdl3.service.DocXMLUtil;
|
---|
25 | import org.greenstone.gsdl3.util.*;
|
---|
26 | import org.greenstone.util.GlobalProperties;
|
---|
27 |
|
---|
28 | // XML classes
|
---|
29 | import org.w3c.dom.Document;
|
---|
30 | import org.w3c.dom.Element;
|
---|
31 | import org.w3c.dom.Node;
|
---|
32 | import org.w3c.dom.Text;
|
---|
33 | import org.w3c.dom.NodeList;
|
---|
34 |
|
---|
35 | // General Java classes
|
---|
36 | import java.util.ArrayList;
|
---|
37 | import java.util.HashMap;
|
---|
38 | import java.util.HashSet;
|
---|
39 | import java.util.Iterator;
|
---|
40 | import java.io.File;
|
---|
41 | import java.io.Serializable;
|
---|
42 |
|
---|
43 | import org.apache.log4j.*;
|
---|
44 |
|
---|
45 | /** Action class for retrieving Documents via the message router */
|
---|
46 | public class DocumentAction extends Action
|
---|
47 | {
|
---|
48 |
|
---|
49 | static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName());
|
---|
50 |
|
---|
51 | // this is used to specify that the sibling nodes of a selected one should be obtained
|
---|
52 | public static final String SIBLING_ARG = "sib";
|
---|
53 | public static final String GOTO_PAGE_ARG = "gp";
|
---|
54 | public static final String ENRICH_DOC_ARG = "end";
|
---|
55 | public static final String EXPAND_DOCUMENT_ARG = "ed";
|
---|
56 | public static final String EXPAND_CONTENTS_ARG = "ec";
|
---|
57 | public static final String REALISTIC_BOOK_ARG = "book";
|
---|
58 | public static final String NO_TEXT_ARG = "noText";
|
---|
59 | public static final String DOC_EDIT_ARG = "docEdit";
|
---|
60 | public static final String DOC_VERSION_ARG = "dv";
|
---|
61 |
|
---|
62 | /**
|
---|
63 | * if this is set to true, when a document is displayed, any annotation type
|
---|
64 | * services (enrich) will be offered to the user as well
|
---|
65 | */
|
---|
66 | protected boolean provide_annotations = false;
|
---|
67 |
|
---|
68 | protected boolean highlight_query_terms = false;
|
---|
69 |
|
---|
70 | public boolean configure()
|
---|
71 | {
|
---|
72 | super.configure();
|
---|
73 | String highlight = (String) config_params.get("highlightQueryTerms");
|
---|
74 | if (highlight != null && highlight.equals("true"))
|
---|
75 | {
|
---|
76 | highlight_query_terms = true;
|
---|
77 | }
|
---|
78 | String annotate = (String) config_params.get("displayAnnotationService");
|
---|
79 | if (annotate != null && annotate.equals("true"))
|
---|
80 | {
|
---|
81 | provide_annotations = true;
|
---|
82 | }
|
---|
83 | return true;
|
---|
84 | }
|
---|
85 |
|
---|
86 | public Node process(Node message_node)
|
---|
87 | {
|
---|
88 | // for now, no subaction eventually we may want to have subactions such as text assoc or something ?
|
---|
89 |
|
---|
90 | Element message = GSXML.nodeToElement(message_node);
|
---|
91 | Document doc = XMLConverter.newDOM();
|
---|
92 |
|
---|
93 | // the response
|
---|
94 | Element result = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
95 | Element page_response = doc.createElement(GSXML.RESPONSE_ELEM);
|
---|
96 | result.appendChild(page_response);
|
---|
97 |
|
---|
98 | // get the request - assume only one
|
---|
99 | Element request = (Element) GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM);
|
---|
100 | Element cgi_paramList = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
101 | HashMap<String, Serializable> params = GSXML.extractParams(cgi_paramList, false);
|
---|
102 |
|
---|
103 | // just in case there are some that need to get passed to the services
|
---|
104 | // why do we use s0 here and s1 in other places???
|
---|
105 | HashMap service_params = (HashMap) params.get("s0");
|
---|
106 |
|
---|
107 | String collection = (String) params.get(GSParams.COLLECTION);
|
---|
108 | String document_id = (String) params.get(GSParams.DOCUMENT);
|
---|
109 | if (document_id != null && document_id.equals(""))
|
---|
110 | {
|
---|
111 | document_id = null;
|
---|
112 | }
|
---|
113 | String href = (String) params.get(GSParams.HREF);//for an external link : get the href URL if it is existing in the params list
|
---|
114 | if (href != null && href.equals(""))
|
---|
115 | {
|
---|
116 | href = null;
|
---|
117 | }
|
---|
118 | String rl = (String) params.get(GSParams.RELATIVE_LINK);//for an external link : get the rl value if it is existing in the params list
|
---|
119 | if (document_id == null && href == null)
|
---|
120 | {
|
---|
121 | logger.error("no document specified!");
|
---|
122 | return result;
|
---|
123 | }
|
---|
124 | if (rl != null && rl.equals("0"))
|
---|
125 | {
|
---|
126 | // this is a true external link, we should have been directed to a different page or action
|
---|
127 | logger.error("rl value was 0, shouldn't get here");
|
---|
128 | return result;
|
---|
129 | }
|
---|
130 |
|
---|
131 | String doc_id_modifier = "";
|
---|
132 | String sibling_num = (String) params.get(GOTO_PAGE_ARG);
|
---|
133 | if (sibling_num != null && !sibling_num.equals(""))
|
---|
134 | {
|
---|
135 | // we have to modify the doc name
|
---|
136 | doc_id_modifier = "." + sibling_num + ".ss";
|
---|
137 | }
|
---|
138 |
|
---|
139 |
|
---|
140 | UserContext userContext = new UserContext(request);
|
---|
141 |
|
---|
142 | //append site metadata
|
---|
143 | addSiteMetadata(page_response, userContext);
|
---|
144 | addInterfaceOptions(page_response);
|
---|
145 |
|
---|
146 | // get the additional data needed for the page
|
---|
147 | getBackgroundData(page_response, collection, userContext);
|
---|
148 |
|
---|
149 | // create a basic doc list containing the current node
|
---|
150 | // we will use this to query whether the id is valid, and to get document type
|
---|
151 | Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
152 | Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
153 | basic_doc_list.appendChild(current_doc);
|
---|
154 | if (document_id != null)
|
---|
155 | {
|
---|
156 | current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id + doc_id_modifier);
|
---|
157 | }
|
---|
158 | else
|
---|
159 | {
|
---|
160 | current_doc.setAttribute(GSXML.HREF_ID_ATT, href);
|
---|
161 | // do we need this??
|
---|
162 | current_doc.setAttribute(GSXML.ID_MOD_ATT, doc_id_modifier);
|
---|
163 | }
|
---|
164 |
|
---|
165 | // lets do a quick check here for valid doc id.
|
---|
166 | if (document_id != null) {
|
---|
167 | boolean is_valid = checkValidOID(basic_doc_list, collection, userContext, page_response );
|
---|
168 | if (!is_valid) {
|
---|
169 | GSXML.addError(page_response, "Invalid doc id ("+document_id+")", GSXML.ERROR_TYPE_INVALID_ID);
|
---|
170 | return result;
|
---|
171 | }
|
---|
172 | }
|
---|
173 | Element format_elem = (Element) GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM);
|
---|
174 |
|
---|
175 | if (format_elem != null) {
|
---|
176 | // lets look for param defaults set in config file
|
---|
177 | NodeList param_defaults = format_elem.getElementsByTagName(GSXML.PARAM_DEFAULT_ELEM);
|
---|
178 | for (int i=0; i<param_defaults.getLength(); i++) {
|
---|
179 | Element p = (Element)param_defaults.item(i);
|
---|
180 | String name = p.getAttribute(GSXML.NAME_ATT);
|
---|
181 | if (params.get(name) ==null) {
|
---|
182 | // wasn't set from interface
|
---|
183 | String value = p.getAttribute(GSXML.VALUE_ATT);
|
---|
184 | params.put(name, value );
|
---|
185 | // also add into request param xml so that xslt knows it too
|
---|
186 | GSXML.addParameterToList(cgi_paramList, name, value);
|
---|
187 | }
|
---|
188 | }
|
---|
189 | }
|
---|
190 |
|
---|
191 | String document_type = (String) params.get(GSParams.DOCUMENT_TYPE);
|
---|
192 | if (document_type != null && document_type.equals(""))
|
---|
193 | {
|
---|
194 | //document_type = "hierarchy";
|
---|
195 | document_type = null; // we'll get it later if not already specified
|
---|
196 | }
|
---|
197 | // what if it is null here?? Anu to check...
|
---|
198 |
|
---|
199 |
|
---|
200 | boolean editing_document = false;
|
---|
201 | String doc_edit = (String) params.get(DOC_EDIT_ARG);
|
---|
202 | if (doc_edit != null && doc_edit.equals("1")) {
|
---|
203 | editing_document = true;
|
---|
204 | }
|
---|
205 |
|
---|
206 | // are we editing mode? just get the archive document, convert to our internal doc format, and return it
|
---|
207 | if (editing_document) {
|
---|
208 | String opt_document_version = (String) params.get(DOC_VERSION_ARG);
|
---|
209 | return getFormattedArchiveDoc(doc, collection, document_id, opt_document_version, document_type, result, page_response, userContext);
|
---|
210 | }
|
---|
211 |
|
---|
212 | //whether to retrieve siblings or not
|
---|
213 | boolean get_siblings = false;
|
---|
214 | String sibs = (String) params.get(SIBLING_ARG);
|
---|
215 | if (sibs != null && sibs.equals("1"))
|
---|
216 | {
|
---|
217 | get_siblings = true;
|
---|
218 | }
|
---|
219 |
|
---|
220 | boolean expand_document = false;
|
---|
221 | String ed_arg = (String) params.get(EXPAND_DOCUMENT_ARG);
|
---|
222 | if (ed_arg != null && ed_arg.equals("1"))
|
---|
223 | {
|
---|
224 | expand_document = true;
|
---|
225 | }
|
---|
226 |
|
---|
227 | boolean expand_contents = false;
|
---|
228 | if (expand_document)
|
---|
229 | { // we always expand the contents with the text
|
---|
230 | expand_contents = true;
|
---|
231 | }
|
---|
232 | else
|
---|
233 | {
|
---|
234 | String ec_arg = (String) params.get(EXPAND_CONTENTS_ARG);
|
---|
235 | if (ec_arg != null && ec_arg.equals("1"))
|
---|
236 | {
|
---|
237 | expand_contents = true;
|
---|
238 | }
|
---|
239 | }
|
---|
240 |
|
---|
241 | // do we want text content? Not if no_text=1.
|
---|
242 | // expand_document overrides this. - should it??
|
---|
243 | boolean get_text = true;
|
---|
244 | String nt_arg = (String) params.get(NO_TEXT_ARG);
|
---|
245 |
|
---|
246 | if (!expand_document && nt_arg!=null && nt_arg.equals("1")) {
|
---|
247 | logger.debug("SETTING GET TEXT TO FALSE");
|
---|
248 | get_text = false;
|
---|
249 | } else {
|
---|
250 | logger.debug("GET TEXT REMAINS TRUE");
|
---|
251 | }
|
---|
252 |
|
---|
253 | // the_document is where all the doc info - structure and metadata etc
|
---|
254 | // is added into, to be returned in the page
|
---|
255 | Element the_document = doc.createElement(GSXML.DOCUMENT_ELEM);
|
---|
256 | page_response.appendChild(the_document);
|
---|
257 |
|
---|
258 | // used to create basic_doc_list here
|
---|
259 | if (document_type == null)
|
---|
260 | {
|
---|
261 | document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
|
---|
262 | }
|
---|
263 | if (document_type == null)
|
---|
264 | {
|
---|
265 | logger.debug("##### doctype is null, setting to simple");
|
---|
266 | document_type = GSXML.DOC_TYPE_SIMPLE;
|
---|
267 | }
|
---|
268 |
|
---|
269 | the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
|
---|
270 |
|
---|
271 | // start getting doc structure
|
---|
272 |
|
---|
273 | // Create a parameter list to specify the required structure information
|
---|
274 | Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
275 |
|
---|
276 | if (service_params != null)
|
---|
277 | {
|
---|
278 | GSXML.addParametersToList(ds_param_list, service_params);
|
---|
279 | }
|
---|
280 |
|
---|
281 | Element ds_param = null;
|
---|
282 | boolean get_structure = false;
|
---|
283 | boolean get_structure_info = false;
|
---|
284 | if (document_type.equals(GSXML.DOC_TYPE_PAGED))
|
---|
285 | {
|
---|
286 | get_structure_info = true;
|
---|
287 |
|
---|
288 | if (expand_contents)
|
---|
289 | {
|
---|
290 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
291 | ds_param_list.appendChild(ds_param);
|
---|
292 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
293 | ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
|
---|
294 | }
|
---|
295 |
|
---|
296 | // get the info needed for paged naviagtion
|
---|
297 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
298 | ds_param_list.appendChild(ds_param);
|
---|
299 | ds_param.setAttribute(GSXML.NAME_ATT, "info");
|
---|
300 | ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings");
|
---|
301 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
302 | ds_param_list.appendChild(ds_param);
|
---|
303 | ds_param.setAttribute(GSXML.NAME_ATT, "info");
|
---|
304 | ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren");
|
---|
305 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
306 | ds_param_list.appendChild(ds_param);
|
---|
307 | ds_param.setAttribute(GSXML.NAME_ATT, "info");
|
---|
308 | ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition");
|
---|
309 |
|
---|
310 | if (get_siblings)
|
---|
311 | {
|
---|
312 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
313 | ds_param_list.appendChild(ds_param);
|
---|
314 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
315 | ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
|
---|
316 | }
|
---|
317 |
|
---|
318 | }
|
---|
319 | else if (document_type.equals(GSXML.DOC_TYPE_HIERARCHY) || document_type.equals(GSXML.DOC_TYPE_PAGED_HIERARCHY))
|
---|
320 | {
|
---|
321 | get_structure = true;
|
---|
322 | if (expand_contents)
|
---|
323 | {
|
---|
324 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
325 | ds_param_list.appendChild(ds_param);
|
---|
326 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
327 | ds_param.setAttribute(GSXML.VALUE_ATT, "entire");
|
---|
328 | }
|
---|
329 | else
|
---|
330 | {
|
---|
331 | // get the info needed for table of contents
|
---|
332 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
333 | ds_param_list.appendChild(ds_param);
|
---|
334 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
335 | ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors");
|
---|
336 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
337 | ds_param_list.appendChild(ds_param);
|
---|
338 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
339 | ds_param.setAttribute(GSXML.VALUE_ATT, "children");
|
---|
340 | if (get_siblings)
|
---|
341 | {
|
---|
342 | ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
343 | ds_param_list.appendChild(ds_param);
|
---|
344 | ds_param.setAttribute(GSXML.NAME_ATT, "structure");
|
---|
345 | ds_param.setAttribute(GSXML.VALUE_ATT, "siblings");
|
---|
346 | }
|
---|
347 | }
|
---|
348 | }
|
---|
349 | else
|
---|
350 | {
|
---|
351 | // we dont need any structure
|
---|
352 | }
|
---|
353 |
|
---|
354 | boolean has_dummy = false;
|
---|
355 | if (get_structure || get_structure_info)
|
---|
356 | {
|
---|
357 |
|
---|
358 | // Build a request to obtain the document structure
|
---|
359 | Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
360 | String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
|
---|
361 | Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
362 | ds_message.appendChild(ds_request);
|
---|
363 | ds_request.appendChild(ds_param_list);
|
---|
364 |
|
---|
365 | // add the node list we created earlier
|
---|
366 | ds_request.appendChild(basic_doc_list);
|
---|
367 |
|
---|
368 | // Process the document structure retrieve message
|
---|
369 | Element ds_response_message = (Element) this.mr.process(ds_message);
|
---|
370 | if (processErrorElements(ds_response_message, page_response))
|
---|
371 | {
|
---|
372 | return result;
|
---|
373 | }
|
---|
374 |
|
---|
375 | // get the info and print out
|
---|
376 | String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
377 | path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
|
---|
378 | path = GSPath.appendLink(path, "nodeStructureInfo");
|
---|
379 | Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);
|
---|
380 | // get the doc_node bit
|
---|
381 | if (ds_response_struct_info != null)
|
---|
382 | {
|
---|
383 | the_document.appendChild(doc.importNode(ds_response_struct_info, true));
|
---|
384 | }
|
---|
385 | path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
386 | path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
|
---|
387 | path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);
|
---|
388 | Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);
|
---|
389 |
|
---|
390 | if (ds_response_structure != null)
|
---|
391 | {
|
---|
392 | // add the contents of the structure bit into the_document
|
---|
393 | NodeList structs = ds_response_structure.getChildNodes();
|
---|
394 | for (int i = 0; i < structs.getLength(); i++)
|
---|
395 | {
|
---|
396 | the_document.appendChild(doc.importNode(structs.item(i), true));
|
---|
397 | }
|
---|
398 | }
|
---|
399 | else
|
---|
400 | {
|
---|
401 | // no structure nodes, so put in a dummy doc node
|
---|
402 | Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
403 | if (document_id != null)
|
---|
404 | {
|
---|
405 | doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
406 | }
|
---|
407 | else
|
---|
408 | {
|
---|
409 | doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
|
---|
410 |
|
---|
411 | }
|
---|
412 | the_document.appendChild(doc_node);
|
---|
413 | has_dummy = true;
|
---|
414 | }
|
---|
415 | }
|
---|
416 | else
|
---|
417 | { // a simple type - we dont have a dummy node for simple
|
---|
418 | // should think about this more
|
---|
419 | // no structure request, so just put in a dummy doc node
|
---|
420 | Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
421 | if (document_id != null)
|
---|
422 | {
|
---|
423 | doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
424 | }
|
---|
425 | else
|
---|
426 | {
|
---|
427 | doc_node.setAttribute(GSXML.HREF_ID_ATT, href);
|
---|
428 | }
|
---|
429 | the_document.appendChild(doc_node);
|
---|
430 | has_dummy = true;
|
---|
431 | }
|
---|
432 |
|
---|
433 | // end getting doc structure
|
---|
434 |
|
---|
435 | // start getting doc metadata
|
---|
436 |
|
---|
437 | // Build a request to obtain some document metadata
|
---|
438 | Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
439 | String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_METADATA_RETRIEVE_SERVICE);
|
---|
440 | Element dm_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
441 | dm_message.appendChild(dm_request);
|
---|
442 | // Create a parameter list to specify the required metadata information
|
---|
443 |
|
---|
444 | HashSet<String> meta_names = new HashSet<String>();
|
---|
445 | meta_names.add("Title"); // the default
|
---|
446 | getRequiredMetadataNames(meta_names, format_elem, request);
|
---|
447 |
|
---|
448 | Element dm_param_list = createMetadataParamList(doc,meta_names);
|
---|
449 | if (service_params != null)
|
---|
450 | {
|
---|
451 | GSXML.addParametersToList(dm_param_list, service_params);
|
---|
452 | }
|
---|
453 |
|
---|
454 | dm_request.appendChild(dm_param_list);
|
---|
455 |
|
---|
456 | // create the doc node list for the metadata request
|
---|
457 | Element dm_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
458 | dm_request.appendChild(dm_doc_list);
|
---|
459 |
|
---|
460 | // Add each node from the structure response into the metadata request
|
---|
461 | NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);
|
---|
462 | for (int i = 0; i < doc_nodes.getLength(); i++)
|
---|
463 | {
|
---|
464 | Element doc_node = (Element) doc_nodes.item(i);
|
---|
465 | String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);
|
---|
466 |
|
---|
467 | // Add the documentNode to the list
|
---|
468 | Element dm_doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
469 | if (needSectionContent(params)) {
|
---|
470 | if (doc_node_id.equals(document_id)) {
|
---|
471 | dm_doc_list.appendChild(dm_doc_node);
|
---|
472 | }
|
---|
473 | } else {
|
---|
474 | dm_doc_list.appendChild(dm_doc_node);
|
---|
475 | }
|
---|
476 | //dm_doc_list.appendChild(dm_doc_node);
|
---|
477 | dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);
|
---|
478 | dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, doc_node.getAttribute(GSXML.NODE_TYPE_ATT));
|
---|
479 | if (document_id == null){
|
---|
480 | dm_doc_node.setAttribute(GSXML.HREF_ID_ATT, href );
|
---|
481 | }
|
---|
482 |
|
---|
483 | }
|
---|
484 | // we also want a metadata request to the top level document to get
|
---|
485 | // assocfilepath - this could be cached too
|
---|
486 | Element doc_meta_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
487 | dm_message.appendChild(doc_meta_request);
|
---|
488 | Element doc_meta_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
489 | if (service_params != null)
|
---|
490 | {
|
---|
491 | GSXML.addParametersToList(doc_meta_param_list, service_params);
|
---|
492 | }
|
---|
493 |
|
---|
494 | doc_meta_request.appendChild(doc_meta_param_list);
|
---|
495 | Element doc_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
496 | doc_meta_param_list.appendChild(doc_param);
|
---|
497 | doc_param.setAttribute(GSXML.NAME_ATT, "metadata");
|
---|
498 | doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");
|
---|
499 |
|
---|
500 | // create the doc node list for the metadata request
|
---|
501 | Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
502 | doc_meta_request.appendChild(doc_list);
|
---|
503 |
|
---|
504 | Element doc_node = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
505 | // the node we want is the root document node
|
---|
506 | if (document_id != null)
|
---|
507 | {
|
---|
508 | doc_node.setAttribute(GSXML.NODE_ID_ATT, document_id + ".rt");
|
---|
509 | }
|
---|
510 | /*else
|
---|
511 | {
|
---|
512 | doc_node.setAttribute(GSXML.HREF_ID_ATT, href);// + ".rt");
|
---|
513 | // can we assume that href is always a top level doc??
|
---|
514 | //doc_node.setAttribute(GSXML.ID_MOD_ATT, ".rt");
|
---|
515 | //doc_node.setAttribute("externalURL", has_rl);
|
---|
516 | }*/
|
---|
517 | doc_list.appendChild(doc_node);
|
---|
518 |
|
---|
519 | Element dm_response_message = (Element) this.mr.process(dm_message);
|
---|
520 | if (processErrorElements(dm_response_message, page_response))
|
---|
521 | {
|
---|
522 | return result;
|
---|
523 | }
|
---|
524 |
|
---|
525 | String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
526 | Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);
|
---|
527 |
|
---|
528 | // Merge the metadata with the structure information
|
---|
529 | NodeList dm_response_docs = dm_response_doc_list.getChildNodes();
|
---|
530 | for (int i = 0; i < doc_nodes.getLength(); i++)
|
---|
531 | {
|
---|
532 | Node dcNode;
|
---|
533 | String node_idd = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
|
---|
534 | if (node_idd.isEmpty()) {
|
---|
535 | String href_id_att = ((Element)doc_nodes.item(i)).getAttribute(GSXML.HREF_ID_ATT);
|
---|
536 | dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.HREF_ID_ATT, href_id_att);
|
---|
537 | } else {
|
---|
538 | dcNode = GSXML.getNamedElement(dm_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_idd);
|
---|
539 | }
|
---|
540 | GSXML.mergeMetadataLists(doc_nodes.item(i), dcNode);
|
---|
541 | }
|
---|
542 | // get the top level doc metadata out
|
---|
543 | Element doc_meta_response = (Element) dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);
|
---|
544 | Element top_doc_node = (Element) GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");
|
---|
545 | GSXML.mergeMetadataLists(the_document, top_doc_node);
|
---|
546 |
|
---|
547 | // if we are highlighting query terms, then we also get them highlighted in the metadata
|
---|
548 |
|
---|
549 | HashSet<String> query_term_variants = null;
|
---|
550 | ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
|
---|
551 | boolean do_highlight_query_terms = highlight_query_terms;
|
---|
552 | int query_terms_status = 0;
|
---|
553 | if (highlight_query_terms) {
|
---|
554 | // lets get the query term equivalents
|
---|
555 | query_term_variants = new HashSet<String>();
|
---|
556 | phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
|
---|
557 | if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
|
---|
558 | do_highlight_query_terms = false; // we couldn't get the terms
|
---|
559 | }
|
---|
560 | }
|
---|
561 |
|
---|
562 | // lets try marking up the metadata with search terms
|
---|
563 | // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
|
---|
564 | boolean highlight_case_insensitive = false;
|
---|
565 | if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
|
---|
566 | highlight_case_insensitive = true;
|
---|
567 | }
|
---|
568 | if (do_highlight_query_terms) {
|
---|
569 | highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
|
---|
570 | }
|
---|
571 |
|
---|
572 | // do we want doc text content? If not, we are done.
|
---|
573 | if (!get_text) {
|
---|
574 | // don't get text
|
---|
575 | return result;
|
---|
576 | }
|
---|
577 |
|
---|
578 | // Build a request to obtain some document content
|
---|
579 | Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
580 | to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
|
---|
581 | Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
582 | dc_message.appendChild(dc_request);
|
---|
583 |
|
---|
584 | // Create a parameter list to specify the request parameters - empty for now
|
---|
585 | Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
586 | if (service_params != null)
|
---|
587 | {
|
---|
588 | GSXML.addParametersToList(dc_param_list, service_params);
|
---|
589 | }
|
---|
590 |
|
---|
591 | dc_request.appendChild(dc_param_list);
|
---|
592 |
|
---|
593 | // get the content
|
---|
594 | // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.
|
---|
595 | if (expand_document)
|
---|
596 | {
|
---|
597 | dc_request.appendChild(dm_doc_list);
|
---|
598 | }
|
---|
599 | else
|
---|
600 | {
|
---|
601 | dc_request.appendChild(basic_doc_list);
|
---|
602 | }
|
---|
603 | Element dc_response_message = (Element) this.mr.process(dc_message);
|
---|
604 |
|
---|
605 | if (processErrorElements(dc_response_message, page_response))
|
---|
606 | {
|
---|
607 | return result;
|
---|
608 |
|
---|
609 | }
|
---|
610 | Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
|
---|
611 |
|
---|
612 | boolean get_marked_up_doc_from_query = false;
|
---|
613 | if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
|
---|
614 | get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
|
---|
615 | }
|
---|
616 |
|
---|
617 | if (expand_document)
|
---|
618 | {
|
---|
619 | // Merge the content with the structure information
|
---|
620 | NodeList dc_response_docs = dc_response_doc_list.getChildNodes();
|
---|
621 | for (int i = 0; i < doc_nodes.getLength(); i++)
|
---|
622 | {
|
---|
623 | String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
|
---|
624 | Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
|
---|
625 | Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
|
---|
626 | if (content != null)
|
---|
627 | {
|
---|
628 | if (do_highlight_query_terms) {
|
---|
629 | if (get_marked_up_doc_from_query) {
|
---|
630 |
|
---|
631 | Element new_content = retrieveHighlightedContent(request, node_id);
|
---|
632 |
|
---|
633 | if (new_content == null) {
|
---|
634 | // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
|
---|
635 | get_marked_up_doc_from_query = false;
|
---|
636 | content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
|
---|
637 | } else {
|
---|
638 | content= new_content;
|
---|
639 | }
|
---|
640 | } else {
|
---|
641 | content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
|
---|
642 | }
|
---|
643 | }
|
---|
644 | doc_nodes.item(i).appendChild(doc.importNode(content, true));
|
---|
645 | }
|
---|
646 |
|
---|
647 | }
|
---|
648 | if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
|
---|
649 | Element dummy_node = (Element) doc_nodes.item(0);
|
---|
650 | the_document.removeChild(dummy_node);
|
---|
651 | the_document.setAttribute(GSXML.NODE_ID_ATT, dummy_node.getAttribute(GSXML.NODE_ID_ATT));
|
---|
652 | NodeList dummy_children = dummy_node.getChildNodes();
|
---|
653 | for (int i = dummy_children.getLength() - 1; i >= 0; i--)
|
---|
654 | {
|
---|
655 | // special case as we don't want more than one metadata list
|
---|
656 | if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
|
---|
657 | {
|
---|
658 | GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
|
---|
659 | }
|
---|
660 | else
|
---|
661 | {
|
---|
662 | the_document.appendChild(dummy_children.item(i));
|
---|
663 | }
|
---|
664 | }
|
---|
665 | }
|
---|
666 | }
|
---|
667 | else
|
---|
668 | {
|
---|
669 | Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
|
---|
670 | Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
|
---|
671 |
|
---|
672 | if (dc_response_doc_content == null)
|
---|
673 | {
|
---|
674 | // no content to add
|
---|
675 | if (dc_response_doc.getAttribute("external").equals("true"))
|
---|
676 | {
|
---|
677 | String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
|
---|
678 |
|
---|
679 | the_document.setAttribute("selectedNode", href_id);
|
---|
680 | the_document.setAttribute("external", href_id);
|
---|
681 | }
|
---|
682 | return result;
|
---|
683 | }
|
---|
684 | if (do_highlight_query_terms)
|
---|
685 | {
|
---|
686 | dc_response_doc.removeChild(dc_response_doc_content);
|
---|
687 | if (get_marked_up_doc_from_query) {
|
---|
688 | Element new_content = retrieveHighlightedContent(request, null);
|
---|
689 | if (new_content == null) {
|
---|
690 | get_marked_up_doc_from_query = false;
|
---|
691 | dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
|
---|
692 | } else {
|
---|
693 |
|
---|
694 | dc_response_doc_content = new_content;
|
---|
695 | }
|
---|
696 | } else {
|
---|
697 | dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
|
---|
698 | }
|
---|
699 | dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
|
---|
700 | }
|
---|
701 |
|
---|
702 | if (provide_annotations)
|
---|
703 | {
|
---|
704 | String service_selected = (String) params.get(ENRICH_DOC_ARG);
|
---|
705 | if (service_selected != null && service_selected.equals("1"))
|
---|
706 | {
|
---|
707 | // now we can modifiy the response doc if needed
|
---|
708 | String enrich_service = (String) params.get(GSParams.SERVICE);
|
---|
709 | // send a message to the service
|
---|
710 | Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
711 | Element enrich_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, userContext);
|
---|
712 | enrich_message.appendChild(enrich_request);
|
---|
713 | // check for parameters
|
---|
714 | HashMap e_service_params = (HashMap) params.get("s1");
|
---|
715 | if (e_service_params != null)
|
---|
716 | {
|
---|
717 | Element enrich_pl = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
718 | GSXML.addParametersToList(enrich_pl, e_service_params);
|
---|
719 | enrich_request.appendChild(enrich_pl);
|
---|
720 | }
|
---|
721 | Element e_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
722 | enrich_request.appendChild(e_doc_list);
|
---|
723 | e_doc_list.appendChild(doc.importNode(dc_response_doc, true));
|
---|
724 |
|
---|
725 | Node enrich_response = this.mr.process(enrich_message);
|
---|
726 |
|
---|
727 | String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM };
|
---|
728 | path = GSPath.createPath(links);
|
---|
729 | dc_response_doc_content = (Element) GSXML.getNodeByPath(enrich_response, path);
|
---|
730 |
|
---|
731 | }
|
---|
732 | } // if provide_annotations
|
---|
733 |
|
---|
734 | // use the returned id rather than the sent one cos there may have
|
---|
735 | // been modifiers such as .pr that are removed.
|
---|
736 | String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);
|
---|
737 | the_document.setAttribute("selectedNode", modified_doc_id);
|
---|
738 | if (has_dummy)
|
---|
739 | {
|
---|
740 | // change the id if necessary and add the content
|
---|
741 | Element dummy_node = (Element) doc_nodes.item(0);
|
---|
742 |
|
---|
743 | dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
|
---|
744 | dummy_node.appendChild(doc.importNode(dc_response_doc_content, true));
|
---|
745 | // hack for simple type
|
---|
746 | if (document_type.equals(GSXML.DOC_TYPE_SIMPLE))
|
---|
747 | {
|
---|
748 | // we dont want the internal docNode, just want the content and metadata in the document
|
---|
749 | // rethink this!!
|
---|
750 | the_document.removeChild(dummy_node);
|
---|
751 |
|
---|
752 | NodeList dummy_children = dummy_node.getChildNodes();
|
---|
753 | for (int i = dummy_children.getLength() - 1; i >= 0; i--)
|
---|
754 | {
|
---|
755 | // special case as we don't want more than one metadata list
|
---|
756 | if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER))
|
---|
757 | {
|
---|
758 | GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));
|
---|
759 | }
|
---|
760 | else
|
---|
761 | {
|
---|
762 | the_document.appendChild(dummy_children.item(i));
|
---|
763 | }
|
---|
764 | }
|
---|
765 | }
|
---|
766 |
|
---|
767 | the_document.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);
|
---|
768 | }
|
---|
769 | else
|
---|
770 | {
|
---|
771 | // Merge the document content with the metadata and structure information
|
---|
772 | for (int i = 0; i < doc_nodes.getLength(); i++)
|
---|
773 | {
|
---|
774 | Node dn = doc_nodes.item(i);
|
---|
775 | String dn_id = ((Element) dn).getAttribute(GSXML.NODE_ID_ATT);
|
---|
776 | if (dn_id.equals(modified_doc_id))
|
---|
777 | {
|
---|
778 | dn.appendChild(doc.importNode(dc_response_doc_content, true));
|
---|
779 | break;
|
---|
780 | }
|
---|
781 | }
|
---|
782 | }
|
---|
783 | }
|
---|
784 | //logger.debug("(DocumentAction) Page:\n" + GSXML.xmlNodeToString(result));
|
---|
785 | return result;
|
---|
786 | }
|
---|
787 |
|
---|
788 | protected boolean checkValidOID(Element basic_doc_list, String collection, UserContext userContext, Element page_response) {
|
---|
789 | Document doc = basic_doc_list.getOwnerDocument();
|
---|
790 |
|
---|
791 | Element v_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
792 | String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.VALIDATE_DOCUMENT_ID_SERVICE);
|
---|
793 | Element v_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
794 | v_message.appendChild(v_request);
|
---|
795 |
|
---|
796 | // add the node list
|
---|
797 | v_request.appendChild(basic_doc_list);
|
---|
798 | Element v_response_message = (Element) this.mr.process(v_message);
|
---|
799 | if (processErrorElements(v_response_message, page_response))
|
---|
800 | {
|
---|
801 | return false;
|
---|
802 | }
|
---|
803 | String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM };
|
---|
804 | String path = GSPath.createPath(links);
|
---|
805 | Element info_elem = (Element) GSXML.getNodeByPath(v_response_message, path);
|
---|
806 | if (info_elem == null) {
|
---|
807 | return false;
|
---|
808 | }
|
---|
809 | if (info_elem.getAttribute("valid").equals("true")) {
|
---|
810 | return true;
|
---|
811 | }
|
---|
812 | return false;
|
---|
813 |
|
---|
814 | }
|
---|
815 |
|
---|
816 | protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String opt_document_version, String document_type,
|
---|
817 | Element result, Element page_response, UserContext userContext ) {
|
---|
818 | // call get archive doc
|
---|
819 | Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
820 | String to = DocXMLUtil.DOC_XML_GET_SECTION_SERVICE;
|
---|
821 | Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
822 | dx_message.appendChild(dx_request);
|
---|
823 | Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
|
---|
824 | dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
825 | dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
|
---|
826 | dx_section.setAttribute(GSXML.DOC_VERSION_ATT, opt_document_version);
|
---|
827 | dx_request.appendChild(dx_section);
|
---|
828 |
|
---|
829 | Element dx_response_message = (Element) this.mr.process(dx_message);
|
---|
830 | if (processErrorElements(dx_response_message, page_response))
|
---|
831 | {
|
---|
832 | return result;
|
---|
833 | }
|
---|
834 |
|
---|
835 | // get the section out
|
---|
836 | String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
|
---|
837 | Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
|
---|
838 | if (section == null) {
|
---|
839 | logger.error("no archive doc returned for "+document_id);
|
---|
840 | return result;
|
---|
841 | }
|
---|
842 | // convert the archive format into the internal format that the page response requires
|
---|
843 |
|
---|
844 | // work out doctype
|
---|
845 | // NOTE: this will be coming from collection database in index
|
---|
846 | // the archive file doesn't store this. So we have to assume
|
---|
847 | // that the doc type will not be changing with any
|
---|
848 | // modifications happening to archives.
|
---|
849 |
|
---|
850 | // if doc type is null, then we need to work it out.
|
---|
851 | // create a basic doc list containing the current node
|
---|
852 |
|
---|
853 | if (document_type == null) {
|
---|
854 | Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
855 | Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
856 | basic_doc_list.appendChild(current_doc);
|
---|
857 | current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
858 | basic_doc_list.appendChild(current_doc);
|
---|
859 | document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
|
---|
860 | }
|
---|
861 |
|
---|
862 | if (document_type == null) {
|
---|
863 | logger.debug("@@@ doctype is null, setting to simple");
|
---|
864 | document_type = GSXML.DOC_TYPE_SIMPLE;
|
---|
865 | }
|
---|
866 |
|
---|
867 | Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
|
---|
868 | doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
|
---|
869 | page_response.appendChild(doc_elem);
|
---|
870 |
|
---|
871 | Element transformed_section = transformArchiveToDocument(section);
|
---|
872 | if (document_type == GSXML.DOC_TYPE_SIMPLE) {
|
---|
873 | // simple doc, only returning a single document node, which is the top level section.
|
---|
874 | doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
875 | GSXML.mergeElements(doc_elem, transformed_section);
|
---|
876 | return result;
|
---|
877 | }
|
---|
878 |
|
---|
879 | // multi sectioned document.
|
---|
880 | transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
|
---|
881 | // In docEdit mode, we obtain the text from archives, from doc.xml
|
---|
882 | // Now the transformation has replaced <Section> with <documentNode>
|
---|
883 | // Need to add nodeID, nodeType and docType attributes to each docNode
|
---|
884 | // as doc.xml doesn't store that.
|
---|
885 | insertDocNodeAttributes(transformed_section, document_type, null);
|
---|
886 | doc_elem.appendChild(doc.importNode(transformed_section, true));
|
---|
887 | logger.debug("dx result = "+XMLConverter.getPrettyString(result));
|
---|
888 |
|
---|
889 | return result;
|
---|
890 | }
|
---|
891 |
|
---|
892 |
|
---|
893 | private boolean needSectionContent(HashMap<String, Serializable> params) {
|
---|
894 | String document_id = (String) params.get(GSParams.DOCUMENT);
|
---|
895 | String ilt = (String) params.get(GSParams.INLINE_TEMPLATE);
|
---|
896 | String iltPrefix = "<xsl:template match=\"/\"><text><xsl:for-each select=\"/page/pageResponse/document//documentNode[@nodeID =";
|
---|
897 | if (ilt != null && ilt.startsWith(iltPrefix) && document_id != null) {
|
---|
898 | return true;
|
---|
899 | }
|
---|
900 |
|
---|
901 | return false;
|
---|
902 | }
|
---|
903 | /**
|
---|
904 | * this method gets the collection description, the format info, the list of
|
---|
905 | * enrich services, etc - stuff that is needed for the page, but is the same
|
---|
906 | * whatever the query is - should be cached
|
---|
907 | */
|
---|
908 | protected boolean getBackgroundData(Element page_response, String collection, UserContext userContext)
|
---|
909 | {
|
---|
910 | Document doc = page_response.getOwnerDocument();
|
---|
911 |
|
---|
912 | // create a message to process - contains requests for the collection
|
---|
913 | // description, the format element, the enrich services on offer
|
---|
914 | // these could all be cached
|
---|
915 | Element info_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
916 | String path = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
|
---|
917 | // the format request - ignore for now, where does this request go to??
|
---|
918 | Element format_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_FORMAT, path, userContext);
|
---|
919 | info_message.appendChild(format_request);
|
---|
920 |
|
---|
921 | // the enrich_services request - only do this if provide_annotations is true
|
---|
922 |
|
---|
923 | if (provide_annotations)
|
---|
924 | {
|
---|
925 | Element enrich_services_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, "", userContext);
|
---|
926 | enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");
|
---|
927 | info_message.appendChild(enrich_services_request);
|
---|
928 | }
|
---|
929 |
|
---|
930 | Element info_response = (Element) this.mr.process(info_message);
|
---|
931 |
|
---|
932 | // the collection is the first response
|
---|
933 | NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
|
---|
934 | Element format_resp = (Element) responses.item(0);
|
---|
935 |
|
---|
936 | Element format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);
|
---|
937 | if (format_elem != null)
|
---|
938 | {
|
---|
939 | Element global_format_elem = (Element) GSXML.getChildByTagName(format_resp, GSXML.GLOBAL_FORMAT_ELEM);
|
---|
940 | if (global_format_elem != null)
|
---|
941 | {
|
---|
942 | GSXSLT.mergeFormatElements(format_elem, global_format_elem, false);
|
---|
943 | }
|
---|
944 |
|
---|
945 | // set the format type
|
---|
946 | format_elem.setAttribute(GSXML.TYPE_ATT, "display");
|
---|
947 | page_response.appendChild(doc.importNode(format_elem, true));
|
---|
948 | }
|
---|
949 |
|
---|
950 | if (provide_annotations)
|
---|
951 | {
|
---|
952 | Element services_resp = (Element) responses.item(1);
|
---|
953 |
|
---|
954 | // a new message for the mr
|
---|
955 | Element enrich_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
956 | NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);
|
---|
957 | boolean service_found = false;
|
---|
958 | for (int j = 0; j < e_services.getLength(); j++)
|
---|
959 | {
|
---|
960 | if (((Element) e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich"))
|
---|
961 | {
|
---|
962 | Element s = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element) e_services.item(j)).getAttribute(GSXML.NAME_ATT), userContext);
|
---|
963 | enrich_message.appendChild(s);
|
---|
964 | service_found = true;
|
---|
965 | }
|
---|
966 | }
|
---|
967 | if (service_found)
|
---|
968 | {
|
---|
969 | Element enrich_response = (Element) this.mr.process(enrich_message);
|
---|
970 |
|
---|
971 | NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);
|
---|
972 | Element service_list = doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);
|
---|
973 | for (int i = 0; i < e_responses.getLength(); i++)
|
---|
974 | {
|
---|
975 | Element e_resp = (Element) e_responses.item(i);
|
---|
976 | Element e_service = (Element) doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);
|
---|
977 | e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));
|
---|
978 | service_list.appendChild(e_service);
|
---|
979 | }
|
---|
980 | page_response.appendChild(service_list);
|
---|
981 | }
|
---|
982 | } // if provide_annotations
|
---|
983 | return true;
|
---|
984 |
|
---|
985 | }
|
---|
986 |
|
---|
987 | protected String getDocumentType(Element basic_doc_list, String collection, UserContext userContext, Element page_response)
|
---|
988 | {
|
---|
989 | Document doc = basic_doc_list.getOwnerDocument();
|
---|
990 |
|
---|
991 | Element ds_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
992 | String to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_STRUCTURE_RETRIEVE_SERVICE);
|
---|
993 | Element ds_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
994 | ds_message.appendChild(ds_request);
|
---|
995 |
|
---|
996 | // Create a parameter list to specify the required structure information
|
---|
997 | Element ds_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
998 | Element ds_param = doc.createElement(GSXML.PARAM_ELEM);
|
---|
999 | ds_param_list.appendChild(ds_param);
|
---|
1000 | ds_param.setAttribute(GSXML.NAME_ATT, "info");
|
---|
1001 | ds_param.setAttribute(GSXML.VALUE_ATT, "documentType");
|
---|
1002 |
|
---|
1003 | ds_request.appendChild(ds_param_list);
|
---|
1004 |
|
---|
1005 | // add the node list we created earlier
|
---|
1006 | ds_request.appendChild(basic_doc_list);
|
---|
1007 |
|
---|
1008 | // Process the document structure retrieve message
|
---|
1009 | Element ds_response_message = (Element) this.mr.process(ds_message);
|
---|
1010 | if (processErrorElements(ds_response_message, page_response))
|
---|
1011 | {
|
---|
1012 | return null;
|
---|
1013 | }
|
---|
1014 |
|
---|
1015 | String[] links = { GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, "nodeStructureInfo" };
|
---|
1016 | String path = GSPath.createPath(links);
|
---|
1017 | Element info_elem = (Element) GSXML.getNodeByPath(ds_response_message, path);
|
---|
1018 | if (info_elem == null) {
|
---|
1019 | return null;
|
---|
1020 | }
|
---|
1021 | Element doctype_elem = GSXML.getNamedElement(info_elem, "info", "name", "documentType");
|
---|
1022 | if (doctype_elem != null)
|
---|
1023 | {
|
---|
1024 | String doc_type = doctype_elem.getAttribute("value");
|
---|
1025 | return doc_type;
|
---|
1026 | }
|
---|
1027 | return null;
|
---|
1028 | }
|
---|
1029 |
|
---|
1030 | // Recursive method to set the docType, nodeType and nodeID attributes of each docNode
|
---|
1031 | // The docType remains constant as in parameter document_type
|
---|
1032 | // The nodeID for the first (root) docNode is already set. For all children, the rootNode id
|
---|
1033 | // is updated to be <parent-id>.<num-child>, where the first parent-id is rootNode id.
|
---|
1034 | // The nodeType is root if rootNode, internal if there are children and leaf if no children
|
---|
1035 | protected void insertDocNodeAttributes(Element docNode, String document_type, String id) {
|
---|
1036 |
|
---|
1037 | boolean isRoot = false;
|
---|
1038 | if(id == null) { // rootNode, get the root nodeID to work with recursively
|
---|
1039 | id = docNode.getAttribute(GSXML.NODE_ID_ATT);
|
---|
1040 | isRoot = true;
|
---|
1041 | } else { // for all but the root node, need to still set the nodeID
|
---|
1042 | docNode.setAttribute(GSXML.NODE_ID_ATT, id);
|
---|
1043 | }
|
---|
1044 |
|
---|
1045 | docNode.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
|
---|
1046 |
|
---|
1047 | NodeList docNodes = GSXML.getChildrenByTagName(docNode, GSXML.DOC_NODE_ELEM);
|
---|
1048 | if(docNodes.getLength() > 0) {
|
---|
1049 | docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_INTERNAL);
|
---|
1050 | for(int i = 0; i < docNodes.getLength(); i++) {
|
---|
1051 | Element childDocNode = (Element)docNodes.item(i);
|
---|
1052 |
|
---|
1053 | // work out the child docNode's nodeID based on current id
|
---|
1054 | String nodeID = id + "." + (i+1);
|
---|
1055 | insertDocNodeAttributes(childDocNode, document_type, nodeID); //recursion step
|
---|
1056 | }
|
---|
1057 | } else {
|
---|
1058 | docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_LEAF);
|
---|
1059 | }
|
---|
1060 |
|
---|
1061 | // rootNode's nodeType is a special case: it's "root", not "leaf" or "internal"
|
---|
1062 | if(isRoot) docNode.setAttribute(GSXML.NODE_TYPE_ATT, GSXML.NODE_TYPE_ROOT);
|
---|
1063 |
|
---|
1064 | }
|
---|
1065 |
|
---|
1066 | /** run the XSLT transform which converts from doc.xml format to our internal document format */
|
---|
1067 | protected Element transformArchiveToDocument(Element section) {
|
---|
1068 |
|
---|
1069 | String stylesheet_filename = GSFile.stylesheetFile(GlobalProperties.getGSDL3Home(), (String) this.config_params.get(GSConstants.SITE_NAME), "", (String) this.config_params.get(GSConstants.INTERFACE_NAME), (ArrayList<String>) this.config_params.get(GSConstants.BASE_INTERFACES), "archive2document.xsl");
|
---|
1070 | if (stylesheet_filename == null) {
|
---|
1071 | logger.error("Couldn't find stylesheet archive2document.xsl");
|
---|
1072 | return section;
|
---|
1073 | }
|
---|
1074 |
|
---|
1075 | Document stylesheet_doc = XMLConverter.getDOM(new File(stylesheet_filename));
|
---|
1076 | if (stylesheet_doc == null) {
|
---|
1077 | logger.error("Couldn't load in stylesheet "+stylesheet_filename);
|
---|
1078 | return section;
|
---|
1079 | }
|
---|
1080 |
|
---|
1081 | Document section_doc = XMLConverter.newDOM();
|
---|
1082 | section_doc.appendChild(section_doc.importNode(section, true));
|
---|
1083 | Node result = this.transformer.transform(stylesheet_doc, section_doc);
|
---|
1084 | logger.debug("transform result = "+XMLConverter.getPrettyString(result));
|
---|
1085 |
|
---|
1086 | Element new_element;
|
---|
1087 | if (result.getNodeType() == Node.DOCUMENT_NODE) {
|
---|
1088 | new_element = ((Document) result).getDocumentElement();
|
---|
1089 | } else {
|
---|
1090 | new_element = (Element) result;
|
---|
1091 | }
|
---|
1092 |
|
---|
1093 |
|
---|
1094 | return new_element;
|
---|
1095 |
|
---|
1096 | }
|
---|
1097 |
|
---|
1098 | protected final int NO_QUERY_TERMS = 0;
|
---|
1099 | protected final int NO_EQUIV_QUERY_TERMS = 1;
|
---|
1100 | protected final int EQUIV_QUERY_TERMS = 2;
|
---|
1101 | /**
|
---|
1102 | * this involves a bit of a hack to get the equivalent query terms - has to
|
---|
1103 | * requery the query service - uses the last selected service name. (if it
|
---|
1104 | * ends in query).
|
---|
1105 | */
|
---|
1106 | protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
|
---|
1107 | {
|
---|
1108 | Document doc = XMLConverter.newDOM();
|
---|
1109 |
|
---|
1110 | // do the query again to get term info
|
---|
1111 | Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1112 | HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
|
---|
1113 |
|
---|
1114 | HashMap previous_params = (HashMap) params.get("p");
|
---|
1115 | if (previous_params == null)
|
---|
1116 | {
|
---|
1117 | return NO_QUERY_TERMS;
|
---|
1118 | }
|
---|
1119 | String service_name = (String) previous_params.get(GSParams.SERVICE);
|
---|
1120 | if (service_name == null || !service_name.endsWith("Query"))
|
---|
1121 | { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
|
---|
1122 | logger.debug("invalid service "+service_name+", not doing highlighting");
|
---|
1123 | return NO_QUERY_TERMS;
|
---|
1124 | }
|
---|
1125 |
|
---|
1126 | String collection = (String) params.get(GSParams.COLLECTION);
|
---|
1127 | UserContext userContext = new UserContext(request);
|
---|
1128 | String to = GSPath.appendLink(collection, service_name);
|
---|
1129 |
|
---|
1130 | Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
1131 | Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
1132 | mr_query_message.appendChild(mr_query_request);
|
---|
1133 |
|
---|
1134 | // paramList
|
---|
1135 | HashMap service_params = (HashMap) params.get("s1");
|
---|
1136 |
|
---|
1137 | Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1138 | GSXML.addParametersToList(query_param_list, service_params);
|
---|
1139 | mr_query_request.appendChild(query_param_list);
|
---|
1140 |
|
---|
1141 | // do the query
|
---|
1142 | Element mr_query_response = (Element) this.mr.process(mr_query_message);
|
---|
1143 |
|
---|
1144 | // find the term lists
|
---|
1145 | String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1146 | Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
|
---|
1147 | if (query_term_list_element == null)
|
---|
1148 | {
|
---|
1149 | // no term info
|
---|
1150 | return NO_QUERY_TERMS;
|
---|
1151 | }
|
---|
1152 |
|
---|
1153 | int result_code = NO_EQUIV_QUERY_TERMS;
|
---|
1154 | NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
|
---|
1155 | if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
|
---|
1156 | {
|
---|
1157 | // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
|
---|
1158 | NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
|
---|
1159 | if (terms_nodelist != null && terms_nodelist.getLength() > 0)
|
---|
1160 | {
|
---|
1161 | for (int i = 0; i < terms_nodelist.getLength(); i++)
|
---|
1162 | {
|
---|
1163 | String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
|
---|
1164 | query_term_variants.add(termValue.toLowerCase());
|
---|
1165 | }
|
---|
1166 | }
|
---|
1167 | }
|
---|
1168 | else
|
---|
1169 | {
|
---|
1170 | result_code = EQUIV_QUERY_TERMS;
|
---|
1171 | for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
|
---|
1172 | {
|
---|
1173 | Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
|
---|
1174 | String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
|
---|
1175 | for (int j = 0; j < equivalent_terms.length; j++)
|
---|
1176 | {
|
---|
1177 | query_term_variants.add(equivalent_terms[j]);
|
---|
1178 | }
|
---|
1179 | }
|
---|
1180 | }
|
---|
1181 |
|
---|
1182 | String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
|
---|
1183 | Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
|
---|
1184 |
|
---|
1185 | Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
|
---|
1186 | String performed_query = GSXML.getNodeText(query_element) + " ";
|
---|
1187 | logger.debug("performed query="+performed_query);
|
---|
1188 |
|
---|
1189 | boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
|
---|
1190 | if (performed_query.contains("\"")) {
|
---|
1191 | has_phrases = true;
|
---|
1192 | }
|
---|
1193 |
|
---|
1194 | ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
|
---|
1195 | int term_start = 0;
|
---|
1196 | boolean in_term = false;
|
---|
1197 | boolean in_phrase = false;
|
---|
1198 | for (int i = 0; i < performed_query.length(); i++) {
|
---|
1199 |
|
---|
1200 | char character = performed_query.charAt(i);
|
---|
1201 | boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
|
---|
1202 |
|
---|
1203 | // Has a query term just started?
|
---|
1204 | if (in_term == false && is_character_letter_or_digit == true)
|
---|
1205 | {
|
---|
1206 | in_term = true;
|
---|
1207 | term_start = i;
|
---|
1208 | }
|
---|
1209 |
|
---|
1210 | // Or has a term just finished?
|
---|
1211 | else if (in_term == true && is_character_letter_or_digit == false)
|
---|
1212 | {
|
---|
1213 | in_term = false;
|
---|
1214 | String term = performed_query.substring(term_start, i);
|
---|
1215 | if (has_phrases) {
|
---|
1216 | // do the phrase bit
|
---|
1217 | HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
|
---|
1218 | if (result_code == EQUIV_QUERY_TERMS) {
|
---|
1219 | Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
|
---|
1220 | if (term_element != null) {
|
---|
1221 | // might be null for eg TX in [snails]:TX
|
---|
1222 |
|
---|
1223 | NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
|
---|
1224 | if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
|
---|
1225 | for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
|
---|
1226 | {
|
---|
1227 | Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
|
---|
1228 | String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
|
---|
1229 | for (int k = 0; k < term_equivalent_terms.length; k++)
|
---|
1230 | {
|
---|
1231 | phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
|
---|
1232 | }
|
---|
1233 | }
|
---|
1234 | }
|
---|
1235 | }
|
---|
1236 | } else { // result_code != EQUIV_QUERY_TERMS
|
---|
1237 | // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
|
---|
1238 | if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
|
---|
1239 | // this handles the case where the user has searched for snails, but term list returns 'snail'
|
---|
1240 | phrase_query_p_term_x_variants.add(term.toLowerCase());
|
---|
1241 | }
|
---|
1242 | }
|
---|
1243 | if (phrase_query_p_term_x_variants.size()>0) {
|
---|
1244 | // we have found a valid term
|
---|
1245 | phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
|
---|
1246 |
|
---|
1247 | if (in_phrase == false)
|
---|
1248 | {
|
---|
1249 | phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
|
---|
1250 | phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
|
---|
1251 | }
|
---|
1252 | }
|
---|
1253 | } // end if has_phrases
|
---|
1254 | else {
|
---|
1255 | // no phrases so we don't have to do the phrasey stuff. but
|
---|
1256 | // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
|
---|
1257 | // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
|
---|
1258 | // but dont want to include eg TX
|
---|
1259 | if (result_code == NO_EQUIV_QUERY_TERMS) {
|
---|
1260 | if (containsSubString(query_term_variants, term)) {
|
---|
1261 | query_term_variants.add(term.toLowerCase());
|
---|
1262 | }
|
---|
1263 | }
|
---|
1264 |
|
---|
1265 | }
|
---|
1266 | } // end of in_term...
|
---|
1267 | // Watch for phrases (surrounded by quotes)
|
---|
1268 | if (character == '\"') {
|
---|
1269 |
|
---|
1270 | // Has a phrase just started?
|
---|
1271 | if (in_phrase == false)
|
---|
1272 | {
|
---|
1273 | in_phrase = true;
|
---|
1274 | }
|
---|
1275 | // Or has a phrase just finished?
|
---|
1276 | else if (in_phrase == true)
|
---|
1277 | {
|
---|
1278 | in_phrase = false;
|
---|
1279 | phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
|
---|
1280 | }
|
---|
1281 |
|
---|
1282 | phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
|
---|
1283 | } // if char == "
|
---|
1284 | } // for each char in performed query
|
---|
1285 |
|
---|
1286 | return result_code;
|
---|
1287 | }
|
---|
1288 |
|
---|
1289 | protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
|
---|
1290 | // hack to filter out TX, TI field names
|
---|
1291 | String lc_term = term.toLowerCase();
|
---|
1292 | if (query_term_variants.contains(term)) {
|
---|
1293 | return false; // or true??
|
---|
1294 | }
|
---|
1295 | if (term.matches("[A-Z][A-Z][A-Z]?")) {
|
---|
1296 | return false;
|
---|
1297 | }
|
---|
1298 | Iterator i = query_term_variants.iterator();
|
---|
1299 | while (i.hasNext()) {
|
---|
1300 | String t = (String)i.next();
|
---|
1301 | if (term.startsWith(t)) {
|
---|
1302 | return true;
|
---|
1303 | }
|
---|
1304 | }
|
---|
1305 | return false;
|
---|
1306 | }
|
---|
1307 |
|
---|
1308 |
|
---|
1309 | /** retrieve the marked up highlighted section - only works for solr collection */
|
---|
1310 | protected Element retrieveHighlightedContent(Element request, String node_id) {
|
---|
1311 |
|
---|
1312 | Document doc = XMLConverter.newDOM();
|
---|
1313 |
|
---|
1314 | // do the query again to get term info
|
---|
1315 | Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1316 | HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
|
---|
1317 |
|
---|
1318 | HashMap previous_params = (HashMap) params.get("p");
|
---|
1319 | if (previous_params == null)
|
---|
1320 | {
|
---|
1321 | return null;
|
---|
1322 | }
|
---|
1323 | String service_name = (String) previous_params.get(GSParams.SERVICE);
|
---|
1324 | if (service_name == null || !service_name.endsWith("Query"))
|
---|
1325 | { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
|
---|
1326 | logger.debug("HL invalid service, not doing highlighting");
|
---|
1327 | return null;
|
---|
1328 | }
|
---|
1329 |
|
---|
1330 | String collection = (String) params.get(GSParams.COLLECTION);
|
---|
1331 | UserContext userContext = new UserContext(request);
|
---|
1332 | String to = GSPath.appendLink(collection, service_name);
|
---|
1333 |
|
---|
1334 | Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
1335 | Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
1336 | mr_query_message.appendChild(mr_query_request);
|
---|
1337 |
|
---|
1338 | // paramList
|
---|
1339 | HashMap service_params = (HashMap) params.get("s1");
|
---|
1340 |
|
---|
1341 | // hack in case the user searched on eg titles, but we want highlighting in the text
|
---|
1342 | service_params.put("index", "TX");
|
---|
1343 | Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1344 | GSXML.addParametersToList(query_param_list, service_params);
|
---|
1345 |
|
---|
1346 | if (node_id != null) {
|
---|
1347 | GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
|
---|
1348 | } else {
|
---|
1349 | GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
|
---|
1350 | }
|
---|
1351 | mr_query_request.appendChild(query_param_list);
|
---|
1352 | // do the query
|
---|
1353 |
|
---|
1354 | Element mr_query_response = (Element) this.mr.process(mr_query_message);
|
---|
1355 | String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
|
---|
1356 | Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
|
---|
1357 |
|
---|
1358 | if (highlighted_node == null) {
|
---|
1359 | return null;
|
---|
1360 | }
|
---|
1361 | // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
|
---|
1362 | //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
|
---|
1363 |
|
---|
1364 | // Build a request to process highlighted text
|
---|
1365 |
|
---|
1366 | Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
|
---|
1367 | to = GSPath.appendLink(collection, AbstractDocumentRetrieve.DOCUMENT_CONTENT_RETRIEVE_SERVICE);
|
---|
1368 | Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
|
---|
1369 | hl_message.appendChild(dc_request);
|
---|
1370 |
|
---|
1371 | // Create a parameter list to specify the request parameters - empty for now
|
---|
1372 | Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
|
---|
1373 | dc_request.appendChild(dc_param_list);
|
---|
1374 |
|
---|
1375 | // get the content
|
---|
1376 | Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
|
---|
1377 | dc_request.appendChild(doc_list);
|
---|
1378 | Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
|
---|
1379 | doc_list.appendChild(current_doc);
|
---|
1380 | current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
|
---|
1381 | //Append highlighted content to request for processing
|
---|
1382 | dc_request.appendChild(doc.importNode(highlighted_node, true));
|
---|
1383 | Element hl_response_message = (Element) this.mr.process(hl_message);
|
---|
1384 | //Get results
|
---|
1385 | NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
|
---|
1386 | Element content = (Element) contentList.item(0);
|
---|
1387 | return content;
|
---|
1388 |
|
---|
1389 |
|
---|
1390 | }
|
---|
1391 | /**
|
---|
1392 | * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
|
---|
1393 | */
|
---|
1394 | protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
|
---|
1395 |
|
---|
1396 | NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
|
---|
1397 | for (int j=named_elems.getLength()-1; j>=0; j--) {
|
---|
1398 | Element this_elem = (Element)named_elems.item(j);
|
---|
1399 | Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
|
---|
1400 | this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
|
---|
1401 | }
|
---|
1402 | return true;
|
---|
1403 | }
|
---|
1404 | /**
|
---|
1405 | * Highlights query terms in the text content of an element.
|
---|
1406 | */
|
---|
1407 | private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
|
---|
1408 | {
|
---|
1409 | String content = GSXML.getNodeText(original_element);
|
---|
1410 | // Convert the content string to an array of characters for speed
|
---|
1411 | char[] content_characters = new char[content.length()];
|
---|
1412 | content.getChars(0, content.length(), content_characters, 0);
|
---|
1413 |
|
---|
1414 | // Now skim through the content, identifying word matches
|
---|
1415 | ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
|
---|
1416 | int word_start = 0;
|
---|
1417 | boolean in_word = false;
|
---|
1418 | boolean preceding_word_matched = false;
|
---|
1419 | boolean inTag = false;
|
---|
1420 | for (int i = 0; i < content_characters.length; i++)
|
---|
1421 | {
|
---|
1422 | //We don't want to find words inside HTML tags
|
---|
1423 | if (content_characters[i] == '<')
|
---|
1424 | {
|
---|
1425 | // are we currently in a word?
|
---|
1426 | if (in_word) {
|
---|
1427 | in_word = false;
|
---|
1428 | String word = new String(content_characters, word_start, (i - word_start));
|
---|
1429 | if (case_insensitive) {
|
---|
1430 | word = word.toLowerCase();
|
---|
1431 | }
|
---|
1432 | if (query_term_variants.contains(word)) {
|
---|
1433 | // We have found a matching word, so remember its location
|
---|
1434 | word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
|
---|
1435 | // should preceding word matched be set to true/false here??
|
---|
1436 | preceding_word_matched = true;
|
---|
1437 | } else {
|
---|
1438 | preceding_word_matched = false;
|
---|
1439 | }
|
---|
1440 | }
|
---|
1441 | inTag = true;
|
---|
1442 | continue;
|
---|
1443 | }
|
---|
1444 | else if (inTag && content_characters[i] == '>')
|
---|
1445 | {
|
---|
1446 | inTag = false;
|
---|
1447 | continue;
|
---|
1448 | }
|
---|
1449 | else if (inTag)
|
---|
1450 | {
|
---|
1451 | continue;
|
---|
1452 | }
|
---|
1453 |
|
---|
1454 | boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
|
---|
1455 |
|
---|
1456 | // Has a word just started?
|
---|
1457 | if (in_word == false && is_character_letter_or_digit == true)
|
---|
1458 | {
|
---|
1459 | in_word = true;
|
---|
1460 | word_start = i;
|
---|
1461 | }
|
---|
1462 |
|
---|
1463 | // Or has a word just finished?
|
---|
1464 | else if (in_word == true && is_character_letter_or_digit == false)
|
---|
1465 | {
|
---|
1466 | in_word = false;
|
---|
1467 |
|
---|
1468 | // Check if the word matches any of the query term equivalents
|
---|
1469 | String word = new String(content_characters, word_start, (i - word_start));
|
---|
1470 | if (case_insensitive) {
|
---|
1471 | word = word.toLowerCase();
|
---|
1472 | }
|
---|
1473 | if (query_term_variants.contains(word))
|
---|
1474 | {
|
---|
1475 | // We have found a matching word, so remember its location
|
---|
1476 | word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
|
---|
1477 | preceding_word_matched = true;
|
---|
1478 | }
|
---|
1479 | else
|
---|
1480 | {
|
---|
1481 | preceding_word_matched = false;
|
---|
1482 | }
|
---|
1483 | }
|
---|
1484 | }
|
---|
1485 |
|
---|
1486 | // Don't forget the last word...
|
---|
1487 | if (in_word == true)
|
---|
1488 | {
|
---|
1489 | // Check if the word matches any of the query term equivalents
|
---|
1490 | String word = new String(content_characters, word_start, (content_characters.length - word_start));
|
---|
1491 | if (case_insensitive) {
|
---|
1492 | word = word.toLowerCase();
|
---|
1493 | }
|
---|
1494 | if (query_term_variants.contains(word))
|
---|
1495 | {
|
---|
1496 | // We have found a matching word, so remember its location
|
---|
1497 | word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
|
---|
1498 | }
|
---|
1499 | }
|
---|
1500 |
|
---|
1501 | if (word_matches.size() == 0) {
|
---|
1502 | // just return a copy of the original element
|
---|
1503 | return (Element)doc.importNode(original_element, true);
|
---|
1504 |
|
---|
1505 | }
|
---|
1506 |
|
---|
1507 | ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
|
---|
1508 | ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
|
---|
1509 |
|
---|
1510 | if (phrase_query_term_variants_hierarchy.size() ==0) {
|
---|
1511 | for (int i = 0; i < word_matches.size(); i++) {
|
---|
1512 | highlight_start_positions.add(Integer.valueOf(word_matches.get(i).start_position));
|
---|
1513 | highlight_end_positions.add(Integer.valueOf(word_matches.get(i).end_position));
|
---|
1514 | }
|
---|
1515 | }
|
---|
1516 | else {
|
---|
1517 | // Deal with phrases now
|
---|
1518 | ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
|
---|
1519 | for (int i = 0; i < word_matches.size(); i++)
|
---|
1520 | {
|
---|
1521 | WordMatch word_match = word_matches.get(i);
|
---|
1522 |
|
---|
1523 | // See if any partial phrase matches are extended by this word
|
---|
1524 | if (word_match.preceding_word_matched)
|
---|
1525 | {
|
---|
1526 | for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
|
---|
1527 | {
|
---|
1528 | PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
|
---|
1529 | ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
|
---|
1530 | HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
|
---|
1531 | if (phrase_query_p_term_x_variants.contains(word_match.word))
|
---|
1532 | {
|
---|
1533 | partial_phrase_match.num_words_matched++;
|
---|
1534 |
|
---|
1535 | // Has a complete phrase match occurred?
|
---|
1536 | if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
|
---|
1537 | {
|
---|
1538 | // Check for overlaps by looking at the previous highlight range
|
---|
1539 | if (!highlight_end_positions.isEmpty())
|
---|
1540 | {
|
---|
1541 | int last_highlight_index = highlight_end_positions.size() - 1;
|
---|
1542 | int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
|
---|
1543 | if (last_highlight_end > partial_phrase_match.start_position)
|
---|
1544 | {
|
---|
1545 | // There is an overlap, so remove the previous phrase match
|
---|
1546 | int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
|
---|
1547 | highlight_end_positions.remove(last_highlight_index);
|
---|
1548 | partial_phrase_match.start_position = last_highlight_start;
|
---|
1549 | }
|
---|
1550 | }
|
---|
1551 |
|
---|
1552 | highlight_start_positions.add(Integer.valueOf(partial_phrase_match.start_position));
|
---|
1553 | highlight_end_positions.add(Integer.valueOf(word_match.end_position));
|
---|
1554 | }
|
---|
1555 | // No, but add the partial match back into the list for next time
|
---|
1556 | else
|
---|
1557 | {
|
---|
1558 | partial_phrase_matches.add(partial_phrase_match);
|
---|
1559 | }
|
---|
1560 | }
|
---|
1561 | }
|
---|
1562 | }
|
---|
1563 | else
|
---|
1564 | {
|
---|
1565 | partial_phrase_matches.clear();
|
---|
1566 | }
|
---|
1567 |
|
---|
1568 | // See if this word is at the start of any of the phrases
|
---|
1569 | for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
|
---|
1570 | {
|
---|
1571 | ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
|
---|
1572 | if (phrase_query_p_term_variants_list.size()>0) {
|
---|
1573 | HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
|
---|
1574 | if (phrase_query_p_term_1_variants.contains(word_match.word))
|
---|
1575 | {
|
---|
1576 | // If this phrase is just one word long, we have a complete match
|
---|
1577 | if (phrase_query_p_term_variants_list.size() == 1)
|
---|
1578 | {
|
---|
1579 | highlight_start_positions.add(Integer.valueOf(word_match.start_position));
|
---|
1580 | highlight_end_positions.add(Integer.valueOf(word_match.end_position));
|
---|
1581 | }
|
---|
1582 | // Otherwise we have the start of a potential phrase match
|
---|
1583 | else
|
---|
1584 | {
|
---|
1585 | partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
|
---|
1586 | }
|
---|
1587 | }
|
---|
1588 | }
|
---|
1589 | }
|
---|
1590 | }
|
---|
1591 | }
|
---|
1592 |
|
---|
1593 | // Now add the annotation tags into the document at the correct points
|
---|
1594 | Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
|
---|
1595 | int last_wrote = 0;
|
---|
1596 | for (int i = 0; i < highlight_start_positions.size(); i++)
|
---|
1597 | {
|
---|
1598 | int highlight_start = highlight_start_positions.get(i).intValue();
|
---|
1599 | int highlight_end = highlight_end_positions.get(i).intValue();
|
---|
1600 |
|
---|
1601 | // Print anything before the highlight range
|
---|
1602 | if (last_wrote < highlight_start)
|
---|
1603 | {
|
---|
1604 | String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
|
---|
1605 | content_element.appendChild(doc.createTextNode(preceding_text));
|
---|
1606 | }
|
---|
1607 |
|
---|
1608 | // Print the highlight text, annotated
|
---|
1609 | if (highlight_end > last_wrote)
|
---|
1610 | {
|
---|
1611 | String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
|
---|
1612 | Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
|
---|
1613 | annotation_element.setAttribute("type", "query_term");
|
---|
1614 | content_element.appendChild(annotation_element);
|
---|
1615 | last_wrote = highlight_end;
|
---|
1616 | }
|
---|
1617 | }
|
---|
1618 |
|
---|
1619 | // Finish off any unwritten text
|
---|
1620 | if (last_wrote < content_characters.length)
|
---|
1621 | {
|
---|
1622 | String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
|
---|
1623 | content_element.appendChild(doc.createTextNode(remaining_text));
|
---|
1624 | }
|
---|
1625 | return content_element;
|
---|
1626 | }
|
---|
1627 |
|
---|
1628 |
|
---|
1629 | static private class WordMatch
|
---|
1630 | {
|
---|
1631 | public String word;
|
---|
1632 | public int start_position;
|
---|
1633 | public int end_position;
|
---|
1634 | public boolean preceding_word_matched;
|
---|
1635 |
|
---|
1636 | public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
|
---|
1637 | {
|
---|
1638 | this.word = word;
|
---|
1639 | this.start_position = start_position;
|
---|
1640 | this.end_position = end_position;
|
---|
1641 | this.preceding_word_matched = preceding_word_matched;
|
---|
1642 | }
|
---|
1643 | }
|
---|
1644 |
|
---|
1645 | static private class PartialPhraseMatch
|
---|
1646 | {
|
---|
1647 | public int start_position;
|
---|
1648 | public int query_phrase_number;
|
---|
1649 | public int num_words_matched;
|
---|
1650 |
|
---|
1651 | public PartialPhraseMatch(int start_position, int query_phrase_number)
|
---|
1652 | {
|
---|
1653 | this.start_position = start_position;
|
---|
1654 | this.query_phrase_number = query_phrase_number;
|
---|
1655 | this.num_words_matched = 1;
|
---|
1656 | }
|
---|
1657 | }
|
---|
1658 | }
|
---|