Changeset 24116
- Timestamp: 2011-06-07T17:07:48+12:00 (12 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java
r23628 r24116 1 1 /* 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 2 * DocumentAction.java 3 * Copyright (C) 2002 New Zealand Digital Library, http://www.nzdl.org 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 */ 19 19 package org.greenstone.gsdl3.action; 20 20 … … 39 39 40 40 /** Action class for retrieving Documents via the message router 41 41 */ 42 42 public class DocumentAction extends Action { 43 43 44 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName()); 45 46 // this is used to specify that the sibling nodes of a selected one should be obtained 47 public static final String SIBLING_ARG = "sib"; 48 public static final String GOTO_PAGE_ARG = "gp"; 49 public static final String ENRICH_DOC_ARG = "end"; 50 51 /** if this is set to true, when a document is displayed, any annotation 52 * type services (enrich) will be offered to the user as well */ 53 protected boolean provide_annotations = false; 54 55 protected boolean highlight_query_terms = false; 56 57 public boolean configure() { 58 super.configure(); 59 String highlight = (String)config_params.get("highlightQueryTerms"); 60 if (highlight != null && highlight.equals("true")) { 61 highlight_query_terms = true; 44 static Logger logger = 
Logger.getLogger(org.greenstone.gsdl3.action.DocumentAction.class.getName()); 45 46 // this is used to specify that the sibling nodes of a selected one should be obtained 47 public static final String SIBLING_ARG = "sib"; 48 public static final String GOTO_PAGE_ARG = "gp"; 49 public static final String ENRICH_DOC_ARG = "end"; 50 51 /** if this is set to true, when a document is displayed, any annotation 52 * type services (enrich) will be offered to the user as well */ 53 protected boolean provide_annotations = false; 54 55 protected boolean highlight_query_terms = false; 56 57 public boolean configure() { 58 super.configure(); 59 String highlight = (String)config_params.get("highlightQueryTerms"); 60 if (highlight != null && highlight.equals("true")) { 61 highlight_query_terms = true; 62 } 63 String annotate = (String)config_params.get("displayAnnotationService"); 64 if (annotate != null && annotate.equals("true")) { 65 provide_annotations = true; 66 } 67 return true; 62 68 } 63 String annotate = (String)config_params.get("displayAnnotationService"); 64 if (annotate != null && annotate.equals("true")) { 65 provide_annotations = true; 66 } 67 return true; 68 } 69 public Node process (Node message_node) 70 { 71 // for now, no subaction eventually we may want to have subactions such as text assoc or something ? 
72 73 Element message = this.converter.nodeToElement(message_node); 74 75 // the response 76 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM); 77 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM); 78 result.appendChild(page_response); 79 80 // get the request - assume only one 81 Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM); 82 Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 83 HashMap params = GSXML.extractParams(cgi_paramList, false); 84 85 // just in case there are some that need to get passed to the services 86 HashMap service_params = (HashMap)params.get("s0"); 87 88 89 String has_rl = null; 90 String has_href = null; 91 has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list 92 has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list 93 String collection = (String) params.get(GSParams.COLLECTION); 94 String lang = request.getAttribute(GSXML.LANG_ATT); 95 String uid = request.getAttribute(GSXML.USER_ID_ATT); 96 String document_name = (String) params.get(GSParams.DOCUMENT); 97 if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals(""))) { 98 logger.error("no document specified!"); 99 return result; 100 } 101 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE); 102 if (document_type == null) { 103 document_type = "simple"; 104 } 105 //whether to retrieve siblings or not 106 boolean get_siblings = false; 107 String sibs = (String) params.get(SIBLING_ARG); 108 if (sibs != null && sibs.equals("1")) { 109 get_siblings = true; 69 public Node process (Node message_node) 70 { 71 // for now, no subaction eventually we may want to have subactions such as text assoc or something ? 
72 73 Element message = this.converter.nodeToElement(message_node); 74 75 // the response 76 Element result = this.doc.createElement(GSXML.MESSAGE_ELEM); 77 Element page_response = this.doc.createElement(GSXML.RESPONSE_ELEM); 78 result.appendChild(page_response); 79 80 // get the request - assume only one 81 Element request = (Element)GSXML.getChildByTagName(message, GSXML.REQUEST_ELEM); 82 Element cgi_paramList = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 83 HashMap params = GSXML.extractParams(cgi_paramList, false); 84 85 // just in case there are some that need to get passed to the services 86 HashMap service_params = (HashMap)params.get("s0"); 87 88 89 String has_rl = null; 90 String has_href = null; 91 has_href = (String) params.get("href");//for an external link : get the href URL if it is existing in the params list 92 has_rl = (String) params.get("rl");//for an external link : get the rl value if it is existing in the params list 93 String collection = (String) params.get(GSParams.COLLECTION); 94 String lang = request.getAttribute(GSXML.LANG_ATT); 95 String uid = request.getAttribute(GSXML.USER_ID_ATT); 96 String document_name = (String) params.get(GSParams.DOCUMENT); 97 if ((document_name == null || document_name.equals("")) && (has_href == null || has_href.equals(""))) { 98 logger.error("no document specified!"); 99 return result; 100 } 101 String document_type = (String) params.get(GSParams.DOCUMENT_TYPE); 102 if (document_type == null) { 103 document_type = "simple"; 104 } 105 //whether to retrieve siblings or not 106 boolean get_siblings = false; 107 String sibs = (String) params.get(SIBLING_ARG); 108 if (sibs != null && sibs.equals("1")) { 109 get_siblings = true; 110 } 111 112 String sibling_num = (String) params.get(GOTO_PAGE_ARG); 113 if (sibling_num != null && !sibling_num.equals("")) { 114 // we have to modify the doc name 115 document_name = document_name+"."+sibling_num+".ss"; 116 } 117 118 boolean 
expand_document = false; 119 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT); 120 if (ed_arg != null && ed_arg.equals("1")) { 121 expand_document = true; 122 } 123 124 125 boolean expand_contents = false; 126 if (expand_document) { // we always expand the contents with the text 127 expand_contents = true; 128 } else { 129 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS); 130 if (ec_arg != null && ec_arg.equals("1")) { 131 expand_contents = true; 132 } 133 } 134 135 //append site metadata 136 addSiteMetadata( page_response, lang, uid); 137 138 // get the additional data needed for the page 139 getBackgroundData(page_response, collection, lang, uid); 140 Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM); 141 142 // the_document is where all the doc info - structure and metadata etc 143 // is added into, to be returned in the page 144 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM); 145 page_response.appendChild(the_document); 146 147 // set the doctype from the cgi arg as an attribute 148 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type); 149 150 // create a basic doc list containing the current node 151 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 152 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM); 153 basic_doc_list.appendChild(current_doc); 154 if (document_name.length()!=0){ 155 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name); 156 }else if (has_href.length()!=0){ 157 current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href); 158 current_doc.setAttribute("externalURL", has_rl); 159 } 160 161 // Create a parameter list to specify the required structure information 162 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 163 164 if (service_params != null) { 165 GSXML.addParametersToList(this.doc, ds_param_list, service_params); 166 } 167 168 Element ds_param = null; 
169 boolean get_structure = false; 170 boolean get_structure_info = false; 171 if (document_type.equals("paged")) { 172 get_structure_info = true; 173 // get teh info needed for paged naviagtion 174 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 175 ds_param_list.appendChild(ds_param); 176 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 177 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings"); 178 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 179 ds_param_list.appendChild(ds_param); 180 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 181 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren"); 182 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 183 ds_param_list.appendChild(ds_param); 184 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 185 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition"); 186 187 } else if (document_type.equals("hierarchy")){ 188 get_structure = true; 189 if (expand_contents) { 190 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 191 ds_param_list.appendChild(ds_param); 192 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 193 ds_param.setAttribute(GSXML.VALUE_ATT, "entire"); 194 } else { 195 // get the info needed for table of contents 196 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 197 ds_param_list.appendChild(ds_param); 198 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 199 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors"); 200 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 201 ds_param_list.appendChild(ds_param); 202 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 203 ds_param.setAttribute(GSXML.VALUE_ATT, "children"); 204 if (get_siblings) { 205 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 206 ds_param_list.appendChild(ds_param); 207 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 208 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings"); 209 } 210 } 211 } else { 212 // we dont need any structure 213 } 214 215 boolean has_dummy = false; 216 if (get_structure || 
get_structure_info) { 217 218 // Build a request to obtain the document structure 219 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 220 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired? 221 Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid); 222 ds_message.appendChild(ds_request); 223 ds_request.appendChild(ds_param_list); 224 225 // create a doc_node_list and put in the doc_node that we are interested in 226 ds_request.appendChild(basic_doc_list); 227 228 // Process the document structure retrieve message 229 Element ds_response_message = (Element) this.mr.process(ds_message); 230 if (processErrorElements(ds_response_message, page_response)) { 231 return result; 232 } 233 234 // get the info and print out 235 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 236 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM); 237 path = GSPath.appendLink(path, "nodeStructureInfo"); 238 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path); 239 // get the doc_node bit 240 if (ds_response_struct_info != null) { 241 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true)); 242 } 243 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 244 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM); 245 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM); 246 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path); 247 248 if (ds_response_structure != null) { 249 // add the contents of the structure bit into the_document 250 NodeList structs = ds_response_structure.getChildNodes(); 251 for (int i=0; i<structs.getLength();i++) { 252 the_document.appendChild(this.doc.importNode(structs.item(i), true)); 253 } 254 } else { 255 // no structure nodes, so put in a dummy doc node 256 Element doc_node = 
this.doc.createElement(GSXML.DOC_NODE_ELEM); 257 if (document_name.length()!=0){ 258 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name); 259 }else if (has_href.length()!=0){ 260 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href); 261 doc_node.setAttribute("externalURL", has_rl); 262 } 263 the_document.appendChild(doc_node); 264 has_dummy = true; 265 } 266 } else { // a simple type - we dont have a dummy node for simple 267 // should think about this more 268 // no structure request, so just put in a dummy doc node 269 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM); 270 if (document_name.length()!=0){ 271 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name); 272 }else if (has_href.length()!=0){ 273 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href); 274 doc_node.setAttribute("externalURL", has_rl); 275 } 276 the_document.appendChild(doc_node); 277 has_dummy = true; 278 } 279 280 // Build a request to obtain some document metadata 281 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 282 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired? 
283 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid); 284 dm_message.appendChild(dm_request); 285 // Create a parameter list to specify the required metadata information 286 287 HashSet meta_names = new HashSet(); 288 meta_names.add("Title"); // the default 289 if (format_elem != null) { 290 extractMetadataNames(format_elem, meta_names); 291 } 292 293 Element dm_param_list = createMetadataParamList(meta_names); 294 if (service_params != null) { 295 GSXML.addParametersToList(this.doc, dm_param_list, service_params); 296 } 297 298 dm_request.appendChild(dm_param_list); 299 300 301 // create the doc node list for the metadata request 302 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 303 dm_request.appendChild(dm_doc_list); 304 305 // Add each node from the structure response into the metadata request 306 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM); 307 for (int i = 0; i < doc_nodes.getLength(); i++) { 308 Element doc_node = (Element) doc_nodes.item(i); 309 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT); 310 311 // Add the documentNode to the list 312 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM); 313 dm_doc_list.appendChild(dm_doc_node); 314 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id); 315 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT, 316 doc_node.getAttribute(GSXML.NODE_TYPE_ATT)); 317 } 318 319 // we also want a metadata request to the top level document to get 320 // assocfilepath - this could be cached too 321 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid); 322 dm_message.appendChild(doc_meta_request); 323 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 324 if (service_params != null) { 325 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params); 326 } 327 328 
doc_meta_request.appendChild(doc_meta_param_list); 329 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM); 330 doc_meta_param_list.appendChild(doc_param); 331 doc_param.setAttribute(GSXML.NAME_ATT, "metadata"); 332 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath"); 333 334 // create the doc node list for the metadata request 335 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 336 doc_meta_request.appendChild(doc_list); 337 338 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM); 339 // the node we want is the root document node 340 if (document_name.length()!=0){ 341 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt"); 342 }else if (has_href.length()!=0){ 343 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href+".rt"); 344 doc_node.setAttribute("externalURL", has_rl); 345 } 346 doc_list.appendChild(doc_node); 347 Element dm_response_message = (Element) this.mr.process(dm_message); 348 if (processErrorElements(dm_response_message, page_response)) { 349 return result; 350 } 351 352 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 353 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path); 354 355 // Merge the metadata with the structure information 356 NodeList dm_response_docs = dm_response_doc_list.getChildNodes(); 357 for (int i = 0; i < doc_nodes.getLength(); i++) { 358 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i)); 359 } 360 // get the top level doc metadata out 361 Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1); 362 Element top_doc_node = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode"); 363 GSXML.mergeMetadataLists(the_document, top_doc_node); 364 365 // Build a request to obtain some document content 366 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 367 to = 
GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired? 368 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid); 369 dc_message.appendChild(dc_request); 370 371 372 // Create a parameter list to specify the request parameters - empty for now 373 Element dc_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 374 if (service_params != null) { 375 GSXML.addParametersToList(this.doc, dc_param_list, service_params); 376 } 377 378 dc_request.appendChild(dc_param_list); 379 380 // get the content 381 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request. 382 if (expand_document) { 383 dc_request.appendChild(dm_doc_list); 384 } else { 385 dc_request.appendChild(basic_doc_list); 386 } 387 logger.debug("request = "+converter.getString(dc_message)); 388 Element dc_response_message = (Element) this.mr.process(dc_message); 389 if (processErrorElements(dc_response_message, page_response)) { 390 return result; 391 } 392 393 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path); 394 395 if (expand_document) { 396 // Merge the content with the structure information 397 NodeList dc_response_docs = dc_response_doc_list.getChildNodes(); 398 for (int i = 0; i < doc_nodes.getLength(); i++) { 399 Node content = GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent"); 400 if (content != null) { 401 if (highlight_query_terms) { 402 content = highlightQueryTerms(request, (Element)content); 403 } 404 doc_nodes.item(i).appendChild(this.doc.importNode(content, true)); 405 } 406 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i)); 407 } 408 } else { 409 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM); 410 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM); 
411 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM); 412 Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external"); 413 414 if (dc_response_doc_content == null) { 415 // no content to add 416 if (dc_response_doc_external !=null){ 417 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT); 418 419 the_document.setAttribute("selectedNode", modified_doc_id); 420 the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link")); 421 } 422 return result; 423 } 424 if (highlight_query_terms) { 425 dc_response_doc.removeChild(dc_response_doc_content); 426 427 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content); 428 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true)); 429 } 430 431 432 if (provide_annotations) { 433 String service_selected = (String)params.get(ENRICH_DOC_ARG); 434 if (service_selected != null && service_selected.equals("1")) { 435 // now we can modifiy the response doc if needed 436 String enrich_service = (String)params.get(GSParams.SERVICE); 437 // send a message to the service 438 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 439 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid); 440 enrich_message.appendChild(enrich_request); 441 // check for parameters 442 HashMap e_service_params = (HashMap)params.get("s1"); 443 if (e_service_params != null) { 444 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 445 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params); 446 enrich_request.appendChild(enrich_pl); 447 } 448 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 449 enrich_request.appendChild(e_doc_list); 450 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, 
true)); 451 452 Node enrich_response = this.mr.process(enrich_message); 453 454 String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM}; 455 path = GSPath.createPath(links); 456 dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path); 457 458 } 459 } // if provide_annotations 460 461 462 // use the returned id rather than the sent one cos there may have 463 // been modifiers such as .pr that are removed. 464 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT); 465 the_document.setAttribute("selectedNode", modified_doc_id); 466 if (has_dummy) { 467 // change the id if necessary and add the content 468 Element dummy_node = (Element)doc_nodes.item(0); 469 470 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id); 471 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true)); 472 // hack for simple type 473 if (document_type.equals("simple")) { 474 // we dont want the internal docNode, just want the content and metadata in the document 475 // rethink this!! 
476 the_document.removeChild(dummy_node); 477 478 NodeList dummy_children = dummy_node.getChildNodes(); 479 //for (int i=0; i<dummy_children.getLength(); i++) { 480 for (int i=dummy_children.getLength()-1; i>=0; i--) { 481 // special case as we don't want more than one metadata list 482 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER)) { 483 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i)); 484 } else { 485 the_document.appendChild(dummy_children.item(i)); 486 } 487 } 488 } 489 } else { 490 // Merge the document content with the metadata and structure information 491 for (int i = 0; i < doc_nodes.getLength(); i++) { 492 Node dn = doc_nodes.item(i); 493 String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT); 494 if (dn_id.equals(modified_doc_id)) { 495 dn.appendChild(this.doc.importNode(dc_response_doc_content, true)); 496 break; 497 } 498 } 499 } 500 } 501 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result)); 502 return result; 110 503 } 111 504 112 String sibling_num = (String) params.get(GOTO_PAGE_ARG); 113 if (sibling_num != null && !sibling_num.equals("")) { 114 // we have to modify the doc name 115 document_name = document_name+"."+sibling_num+".ss"; 505 /** tell the param class what its arguments are 506 * if an action has its own arguments, this should add them to the params 507 * object - particularly important for args that should not be saved */ 508 public boolean getActionParameters(GSParams params) { 509 params.addParameter(GOTO_PAGE_ARG, false); 510 params.addParameter(ENRICH_DOC_ARG, false); 511 return true; 116 512 } 117 118 boolean expand_document = false; 119 String ed_arg = (String) params.get(GSParams.EXPAND_DOCUMENT); 120 if (ed_arg != null && ed_arg.equals("1")) { 121 expand_document = true; 513 514 515 /** this method gets the collection description, the format info, the 516 * list of enrich services, etc - stuff that is needed for the page, 517 * but 
is the same whatever the query is - should be cached */ 518 protected boolean getBackgroundData(Element page_response, 519 String collection, String lang, 520 String uid) { 521 522 // create a message to process - contains requests for the collection 523 // description, the format element, the enrich services on offer 524 // these could all be cached 525 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 526 String path = GSPath.appendLink(collection, "DocumentContentRetrieve"); 527 // the format request - ignore for now, where does this request go to?? 528 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid); 529 info_message.appendChild(format_request); 530 531 // the enrich_services request - only do this if provide_annotations is true 532 533 if (provide_annotations) { 534 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid); 535 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList"); 536 info_message.appendChild(enrich_services_request); 537 } 538 539 Element info_response = (Element)this.mr.process(info_message); 540 541 // the collection is the first response 542 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM); 543 Element format_resp = (Element) responses.item(0); 544 545 Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM); 546 if (format_elem != null) { 547 logger.debug("doc action found a format statement"); 548 // set teh format type 549 format_elem.setAttribute(GSXML.TYPE_ATT, "display"); 550 page_response.appendChild(this.doc.importNode(format_elem, true)); 551 } 552 553 if (provide_annotations) { 554 Element services_resp = (Element)responses.item(1); 555 556 // a new message for the mr 557 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 558 559 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM); 560 
boolean service_found = false; 561 for (int j=0; j<e_services.getLength(); j++) { 562 if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) { 563 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid); 564 enrich_message.appendChild(s); 565 service_found = true; 566 } 567 } 568 if (service_found) { 569 Element enrich_response = (Element)this.mr.process(enrich_message); 570 571 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM); 572 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER); 573 for (int i=0; i<e_responses.getLength(); i++) { 574 Element e_resp = (Element)e_responses.item(i); 575 Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true); 576 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT)); 577 service_list.appendChild(e_service); 578 } 579 page_response.appendChild(service_list); 580 } 581 } // if provide_annotations 582 return true; 583 122 584 } 123 124 125 boolean expand_contents = false; 126 if (expand_document) { // we always expand the contents with the text 127 expand_contents = true; 128 } else { 129 String ec_arg = (String) params.get(GSParams.EXPAND_CONTENTS); 130 if (ec_arg != null && ec_arg.equals("1")) { 131 expand_contents = true; 132 } 585 586 /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. also doesn't handle phrases properly - just highlights all the terms found in the text. 
587 */ 588 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) { 589 590 // do the query again to get term info 591 Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 592 HashMap params = GSXML.extractParams(cgi_param_list, false); 593 594 HashMap previous_params = (HashMap)params.get("p"); 595 if (previous_params == null) { 596 return dc_response_doc_content; 597 } 598 String service_name = (String)previous_params.get(GSParams.SERVICE); 599 if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 600 logger.debug("invalid service, not doing highlighting"); 601 return dc_response_doc_content; 602 } 603 String collection = (String)params.get(GSParams.COLLECTION); 604 String lang = request.getAttribute(GSXML.LANG_ATT); 605 String uid = request.getAttribute(GSXML.USER_ID_ATT); 606 String to = GSPath.appendLink(collection, service_name); 607 608 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM); 609 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid); 610 mr_query_message.appendChild(mr_query_request); 611 612 // paramList 613 HashMap service_params = (HashMap)params.get("s1"); 614 615 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 616 GSXML.addParametersToList(this.doc, query_param_list, service_params); 617 mr_query_request.appendChild(query_param_list); 618 619 // do the query 620 Element mr_query_response = (Element)this.mr.process(mr_query_message); 621 622 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER); 623 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path); 624 if (query_term_list_element == null) { 625 // no term info 626 logger.error("No query term 
information.\n"); 627 return dc_response_doc_content; 628 } 629 630 String content = GSXML.getNodeText(dc_response_doc_content); 631 632 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER); 633 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 634 635 HashSet query_term_variants = new HashSet(); 636 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 637 if(equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 638 { 639 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 640 if(terms_nodelist != null && terms_nodelist.getLength() > 0) 641 { 642 for(int i = 0; i < terms_nodelist.getLength(); i++) 643 { 644 String termValue = ((Element)terms_nodelist.item(i)).getAttribute("name"); 645 String termValueU = null; 646 String termValueL = null; 647 648 if(termValue.length() > 1) 649 { 650 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 651 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 652 } 653 else 654 { 655 termValueU = termValue.substring(0, 1).toUpperCase(); 656 termValueL = termValue.substring(0, 1).toLowerCase(); 657 } 658 659 query_term_variants.add(termValueU); 660 query_term_variants.add(termValueL); 661 } 662 } 663 } 664 else 665 { 666 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) { 667 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i); 668 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT); 669 for (int j = 0; j < equivalent_terms.length; j++) { 670 query_term_variants.add(equivalent_terms[j]); 671 } 672 } 673 } 674 675 ArrayList phrase_query_term_variants_hierarchy = new ArrayList(); 676 677 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 678 String 
performed_query = GSXML.getNodeText(query_element) + " "; 679 680 ArrayList phrase_query_p_term_variants_list = new ArrayList(); 681 int term_start = 0; 682 boolean in_term = false; 683 boolean in_phrase = false; 684 for (int i = 0; i < performed_query.length(); i++) { 685 char character = performed_query.charAt(i); 686 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 687 688 // Has a query term just started? 689 if (in_term == false && is_character_letter_or_digit == true) { 690 in_term = true; 691 term_start = i; 692 } 693 694 // Or has a term just finished? 695 else if (in_term == true && is_character_letter_or_digit == false) { 696 in_term = false; 697 String term = performed_query.substring(term_start, i); 698 699 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 700 if (term_element != null) { 701 702 HashSet phrase_query_p_term_x_variants = new HashSet(); 703 704 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 705 if(term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 706 { 707 String termValueU = null; 708 String termValueL = null; 709 710 if(term.length() > 1) 711 { 712 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 713 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 714 } 715 else 716 { 717 termValueU = term.substring(0, 1).toUpperCase(); 718 termValueL = term.substring(0, 1).toLowerCase(); 719 } 720 721 phrase_query_p_term_x_variants.add(termValueU); 722 phrase_query_p_term_x_variants.add(termValueL); 723 } 724 else 725 { 726 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) { 727 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j); 728 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT); 729 for (int k = 0; k < 
term_equivalent_terms.length; k++) { 730 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]); 731 } 732 } 733 } 734 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 735 736 if (in_phrase == false) { 737 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 738 phrase_query_p_term_variants_list = new ArrayList(); 739 } 740 } 741 } 742 // Watch for phrases (surrounded by quotes) 743 if (character == '\"') { 744 // Has a phrase just started? 745 if (in_phrase == false) { 746 in_phrase = true; 747 } 748 // Or has a phrase just finished? 749 else if (in_phrase == true) { 750 in_phrase = false; 751 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 752 } 753 754 phrase_query_p_term_variants_list = new ArrayList(); 755 } 756 } 757 758 System.err.println(query_term_variants + " *** " + phrase_query_term_variants_hierarchy); 759 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy); 133 760 } 134 761 135 //append site metadata 136 addSiteMetadata( page_response, lang, uid); 137 138 // get the additional data needed for the page 139 getBackgroundData(page_response, collection, lang, uid); 140 Element format_elem = (Element)GSXML.getChildByTagName(page_response, GSXML.FORMAT_ELEM); 141 142 // the_document is where all the doc info - structure and metadata etc 143 // is added into, to be returned in the page 144 Element the_document = this.doc.createElement(GSXML.DOCUMENT_ELEM); 145 page_response.appendChild(the_document); 146 147 // set the doctype from the cgi arg as an attribute 148 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type); 149 150 // create a basic doc list containing the current node 151 Element basic_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER); 152 Element current_doc = this.doc.createElement(GSXML.DOC_NODE_ELEM); 153 basic_doc_list.appendChild(current_doc); 154 if (document_name.length()!=0){ 
155 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_name); 156 }else if (has_href.length()!=0){ 157 current_doc.setAttribute(GSXML.NODE_ID_ATT, has_href); 158 current_doc.setAttribute("externalURL", has_rl); 762 763 /** 764 * Highlights query terms in a piece of text. 765 */ 766 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy) 767 { 768 // Convert the content string to an array of characters for speed 769 char[] content_characters = new char[content.length()]; 770 content.getChars(0, content.length(), content_characters, 0); 771 772 // Now skim through the content, identifying word matches 773 ArrayList word_matches = new ArrayList(); 774 int word_start = 0; 775 boolean in_word = false; 776 boolean preceding_word_matched = false; 777 for (int i = 0; i < content_characters.length; i++) { 778 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]); 779 780 // Has a word just started? 781 if (in_word == false && is_character_letter_or_digit == true) { 782 in_word = true; 783 word_start = i; 784 } 785 786 // Or has a word just finished? 787 else if (in_word == true && is_character_letter_or_digit == false) { 788 in_word = false; 789 790 // Check if the word matches any of the query term equivalents 791 String word = new String(content_characters, word_start, (i - word_start)); 792 if (query_term_variants.contains(word)) { 793 // We have found a matching word, so remember its location 794 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 795 preceding_word_matched = true; 796 } 797 else { 798 preceding_word_matched = false; 799 } 800 } 801 } 802 803 // Don't forget the last word... 
804 if (in_word == true) { 805 // Check if the word matches any of the query term equivalents 806 String word = new String(content_characters, word_start, (content_characters.length - word_start)); 807 if (query_term_variants.contains(word)) { 808 // We have found a matching word, so remember its location 809 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched)); 810 } 811 } 812 813 ArrayList highlight_start_positions = new ArrayList(); 814 ArrayList highlight_end_positions = new ArrayList(); 815 816 // Deal with phrases now 817 ArrayList partial_phrase_matches = new ArrayList(); 818 for (int i = 0; i < word_matches.size(); i++) { 819 WordMatch word_match = (WordMatch) word_matches.get(i); 820 821 // See if any partial phrase matches are extended by this word 822 if (word_match.preceding_word_matched) { 823 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) { 824 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j); 825 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number); 826 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched); 827 if (phrase_query_p_term_x_variants.contains(word_match.word)) { 828 partial_phrase_match.num_words_matched++; 829 830 // Has a complete phrase match occurred? 
831 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) { 832 // Check for overlaps by looking at the previous highlight range 833 if (!highlight_end_positions.isEmpty()) { 834 int last_highlight_index = highlight_end_positions.size() - 1; 835 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue(); 836 if (last_highlight_end > partial_phrase_match.start_position) { 837 // There is an overlap, so remove the previous phrase match 838 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue(); 839 highlight_end_positions.remove(last_highlight_index); 840 partial_phrase_match.start_position = last_highlight_start; 841 } 842 } 843 844 highlight_start_positions.add(new Integer(partial_phrase_match.start_position)); 845 highlight_end_positions.add(new Integer(word_match.end_position)); 846 } 847 // No, but add the partial match back into the list for next time 848 else { 849 partial_phrase_matches.add(partial_phrase_match); 850 } 851 } 852 } 853 } 854 else { 855 partial_phrase_matches.clear(); 856 } 857 858 // See if this word is at the start of any of the phrases 859 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) { 860 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p); 861 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0); 862 if (phrase_query_p_term_1_variants.contains(word_match.word)) { 863 // If this phrase is just one word long, we have a complete match 864 if (phrase_query_p_term_variants_list.size() == 1) { 865 highlight_start_positions.add(new Integer(word_match.start_position)); 866 highlight_end_positions.add(new Integer(word_match.end_position)); 867 } 868 // Otherwise we have the start of a potential phrase match 869 else { 870 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p)); 871 } 872 } 
873 } 874 } 875 876 // Now add the annotation tags into the document at the correct points 877 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM); 878 879 int last_wrote = 0; 880 for (int i = 0; i < highlight_start_positions.size(); i++) { 881 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue(); 882 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue(); 883 884 // Print anything before the highlight range 885 if (last_wrote < highlight_start) { 886 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote)); 887 content_element.appendChild(this.doc.createTextNode(preceding_text)); 888 } 889 890 // Print the highlight text, annotated 891 if (highlight_end > last_wrote) { 892 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start)); 893 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text); 894 annotation_element.setAttribute("type", "query_term"); 895 content_element.appendChild(annotation_element); 896 last_wrote = highlight_end; 897 } 898 } 899 900 // Finish off any unwritten text 901 if (last_wrote < content_characters.length) { 902 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote)); 903 content_element.appendChild(this.doc.createTextNode(remaining_text)); 904 } 905 906 return content_element; 159 907 } 160 908 161 // Create a parameter list to specify the required structure information 162 Element ds_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER); 163 164 if (service_params != null) { 165 GSXML.addParametersToList(this.doc, ds_param_list, service_params); 909 910 static private class WordMatch 911 { 912 public String word; 913 public int start_position; 914 public int end_position; 915 public boolean preceding_word_matched; 916 917 public WordMatch(String word, int start_position, int 
end_position, boolean preceding_word_matched) 918 { 919 this.word = word; 920 this.start_position = start_position; 921 this.end_position = end_position; 922 this.preceding_word_matched = preceding_word_matched; 923 } 166 924 } 167 925 168 Element ds_param = null; 169 boolean get_structure = false; 170 boolean get_structure_info = false; 171 if (document_type.equals("paged")) { 172 get_structure_info = true; 173 // get teh info needed for paged naviagtion 174 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 175 ds_param_list.appendChild(ds_param); 176 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 177 ds_param.setAttribute(GSXML.VALUE_ATT, "numSiblings"); 178 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 179 ds_param_list.appendChild(ds_param); 180 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 181 ds_param.setAttribute(GSXML.VALUE_ATT, "numChildren"); 182 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 183 ds_param_list.appendChild(ds_param); 184 ds_param.setAttribute(GSXML.NAME_ATT, "info"); 185 ds_param.setAttribute(GSXML.VALUE_ATT, "siblingPosition"); 186 187 } else if (document_type.equals("hierarchy")){ 188 get_structure = true; 189 if (expand_contents) { 190 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 191 ds_param_list.appendChild(ds_param); 192 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 193 ds_param.setAttribute(GSXML.VALUE_ATT, "entire"); 194 } else { 195 // get the info needed for table of contents 196 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 197 ds_param_list.appendChild(ds_param); 198 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 199 ds_param.setAttribute(GSXML.VALUE_ATT, "ancestors"); 200 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 201 ds_param_list.appendChild(ds_param); 202 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 203 ds_param.setAttribute(GSXML.VALUE_ATT, "children"); 204 if (get_siblings) { 205 ds_param = this.doc.createElement(GSXML.PARAM_ELEM); 206 
ds_param_list.appendChild(ds_param); 207 ds_param.setAttribute(GSXML.NAME_ATT, "structure"); 208 ds_param.setAttribute(GSXML.VALUE_ATT, "siblings"); 209 } 210 } 211 } else { 212 // we dont need any structure 926 927 static private class PartialPhraseMatch 928 { 929 public int start_position; 930 public int query_phrase_number; 931 public int num_words_matched; 932 933 public PartialPhraseMatch(int start_position, int query_phrase_number) 934 { 935 this.start_position = start_position; 936 this.query_phrase_number = query_phrase_number; 937 this.num_words_matched = 1; 938 } 213 939 } 214 215 boolean has_dummy = false;216 if (get_structure || get_structure_info) {217 218 // Build a request to obtain the document structure219 Element ds_message = this.doc.createElement(GSXML.MESSAGE_ELEM);220 String to = GSPath.appendLink(collection, "DocumentStructureRetrieve");// Hard-wired?221 Element ds_request = GSXML.createBasicRequest(this.doc,GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);222 ds_message.appendChild(ds_request);223 ds_request.appendChild(ds_param_list);224 225 // create a doc_node_list and put in the doc_node that we are interested in226 ds_request.appendChild(basic_doc_list);227 228 // Process the document structure retrieve message229 Element ds_response_message = (Element) this.mr.process(ds_message);230 if (processErrorElements(ds_response_message, page_response)) {231 return result;232 }233 234 // get the info and print out235 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);236 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);237 path = GSPath.appendLink(path, "nodeStructureInfo");238 Element ds_response_struct_info = (Element) GSXML.getNodeByPath(ds_response_message, path);239 // get the doc_node bit240 if (ds_response_struct_info != null) {241 the_document.appendChild(this.doc.importNode(ds_response_struct_info, true));242 }243 path = GSPath.appendLink(GSXML.RESPONSE_ELEM, 
GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);244 path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);245 path = GSPath.appendLink(path, GSXML.NODE_STRUCTURE_ELEM);246 Element ds_response_structure = (Element) GSXML.getNodeByPath(ds_response_message, path);247 248 if (ds_response_structure != null) {249 // add the contents of the structure bit into the_document250 NodeList structs = ds_response_structure.getChildNodes();251 for (int i=0; i<structs.getLength();i++) {252 the_document.appendChild(this.doc.importNode(structs.item(i), true));253 }254 } else {255 // no structure nodes, so put in a dummy doc node256 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);257 if (document_name.length()!=0){258 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);259 }else if (has_href.length()!=0){260 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);261 doc_node.setAttribute("externalURL", has_rl);262 }263 the_document.appendChild(doc_node);264 has_dummy = true;265 }266 } else { // a simple type - we dont have a dummy node for simple267 // should think about this more268 // no structure request, so just put in a dummy doc node269 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);270 if (document_name.length()!=0){271 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name);272 }else if (has_href.length()!=0){273 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href);274 doc_node.setAttribute("externalURL", has_rl);275 }276 the_document.appendChild(doc_node);277 has_dummy = true;278 }279 280 // Build a request to obtain some document metadata281 Element dm_message = this.doc.createElement(GSXML.MESSAGE_ELEM);282 String to = GSPath.appendLink(collection, "DocumentMetadataRetrieve"); // Hard-wired?283 Element dm_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);284 dm_message.appendChild(dm_request);285 // Create a parameter list to specify the required metadata information286 287 HashSet meta_names = new HashSet();288 
meta_names.add("Title"); // the default289 if (format_elem != null) {290 extractMetadataNames(format_elem, meta_names);291 }292 293 Element dm_param_list = createMetadataParamList(meta_names);294 if (service_params != null) {295 GSXML.addParametersToList(this.doc, dm_param_list, service_params);296 }297 298 dm_request.appendChild(dm_param_list);299 300 301 // create the doc node list for the metadata request302 Element dm_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);303 dm_request.appendChild(dm_doc_list);304 305 // Add each node from the structure response into the metadata request306 NodeList doc_nodes = the_document.getElementsByTagName(GSXML.DOC_NODE_ELEM);307 for (int i = 0; i < doc_nodes.getLength(); i++) {308 Element doc_node = (Element) doc_nodes.item(i);309 String doc_node_id = doc_node.getAttribute(GSXML.NODE_ID_ATT);310 311 // Add the documentNode to the list312 Element dm_doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);313 dm_doc_list.appendChild(dm_doc_node);314 dm_doc_node.setAttribute(GSXML.NODE_ID_ATT, doc_node_id);315 dm_doc_node.setAttribute(GSXML.NODE_TYPE_ATT,316 doc_node.getAttribute(GSXML.NODE_TYPE_ATT));317 }318 319 // we also want a metadata request to the top level document to get320 // assocfilepath - this could be cached too321 Element doc_meta_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);322 dm_message.appendChild(doc_meta_request);323 Element doc_meta_param_list = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);324 if (service_params != null) {325 GSXML.addParametersToList(this.doc, doc_meta_param_list, service_params);326 }327 328 doc_meta_request.appendChild(doc_meta_param_list);329 Element doc_param = this.doc.createElement(GSXML.PARAM_ELEM);330 doc_meta_param_list.appendChild(doc_param);331 doc_param.setAttribute(GSXML.NAME_ATT, "metadata");332 doc_param.setAttribute(GSXML.VALUE_ATT, "assocfilepath");333 334 // create the doc node list for 
the metadata request335 Element doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);336 doc_meta_request.appendChild(doc_list);337 338 Element doc_node = this.doc.createElement(GSXML.DOC_NODE_ELEM);339 // the node we want is the root document node340 if (document_name.length()!=0){341 doc_node.setAttribute(GSXML.NODE_ID_ATT, document_name+".rt");342 }else if (has_href.length()!=0){343 doc_node.setAttribute(GSXML.NODE_ID_ATT, has_href+".rt");344 doc_node.setAttribute("externalURL", has_rl);345 }346 doc_list.appendChild(doc_node);347 Element dm_response_message = (Element) this.mr.process(dm_message);348 if (processErrorElements(dm_response_message, page_response)) {349 return result;350 }351 352 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);353 Element dm_response_doc_list = (Element) GSXML.getNodeByPath(dm_response_message, path);354 355 // Merge the metadata with the structure information356 NodeList dm_response_docs = dm_response_doc_list.getChildNodes();357 for (int i = 0; i < doc_nodes.getLength(); i++) {358 GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));359 }360 // get the top level doc metadata out361 Element doc_meta_response = (Element)dm_response_message.getElementsByTagName(GSXML.RESPONSE_ELEM).item(1);362 Element top_doc_node = (Element)GSXML.getNodeByPath(doc_meta_response, "documentNodeList/documentNode");363 GSXML.mergeMetadataLists(the_document, top_doc_node);364 365 // Build a request to obtain some document content366 Element dc_message = this.doc.createElement(GSXML.MESSAGE_ELEM);367 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); // Hard-wired?368 Element dc_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);369 dc_message.appendChild(dc_request);370 371 372 // Create a parameter list to specify the request parameters - empty for now373 Element dc_param_list = 
this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);374 if (service_params != null) {375 GSXML.addParametersToList(this.doc, dc_param_list, service_params);376 }377 378 dc_request.appendChild(dc_param_list);379 380 // get the content381 // the doc list for the content request is the same as the one for the structure request unless we want the whole document, in which case its the same as for the metadata request.382 if (expand_document) {383 dc_request.appendChild(dm_doc_list);384 } else {385 dc_request.appendChild(basic_doc_list);386 }387 logger.debug("request = "+converter.getString(dc_message));388 Element dc_response_message = (Element) this.mr.process(dc_message);389 if (processErrorElements(dc_response_message, page_response)) {390 return result;391 }392 393 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);394 395 if (expand_document) {396 // Merge the content with the structure information397 NodeList dc_response_docs = dc_response_doc_list.getChildNodes();398 for (int i = 0; i < doc_nodes.getLength(); i++) {399 Node content = GSXML.getChildByTagName((Element)dc_response_docs.item(i), "nodeContent");400 if (content != null) {401 doc_nodes.item(i).appendChild(this.doc.importNode(content, true));402 }403 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));404 }405 } else {406 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);407 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);408 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);409 Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");410 411 if (dc_response_doc_content == null) {412 // no content to add413 if (dc_response_doc_external !=null){414 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);415 416 the_document.setAttribute("selectedNode", 
modified_doc_id);417 the_document.setAttribute("external", dc_response_doc_external.getAttribute("external_link"));418 }419 return result;420 }421 if (highlight_query_terms) {422 dc_response_doc.removeChild(dc_response_doc_content);423 424 dc_response_doc_content = highlightQueryTerms(request, dc_response_doc_content);425 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));426 }427 428 429 if (provide_annotations) {430 String service_selected = (String)params.get(ENRICH_DOC_ARG);431 if (service_selected != null && service_selected.equals("1")) {432 // now we can modifiy the response doc if needed433 String enrich_service = (String)params.get(GSParams.SERVICE);434 // send a message to the service435 Element enrich_message = this.doc.createElement(GSXML.MESSAGE_ELEM);436 Element enrich_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, enrich_service, lang, uid);437 enrich_message.appendChild(enrich_request);438 // check for parameters439 HashMap e_service_params = (HashMap)params.get("s1");440 if (e_service_params != null) {441 Element enrich_pl = this.doc.createElement(GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);442 GSXML.addParametersToList(this.doc, enrich_pl, e_service_params);443 enrich_request.appendChild(enrich_pl);444 }445 Element e_doc_list = this.doc.createElement(GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER);446 enrich_request.appendChild(e_doc_list);447 e_doc_list.appendChild(this.doc.importNode(dc_response_doc, true));448 449 Node enrich_response = this.mr.process(enrich_message);450 451 String [] links = {GSXML.RESPONSE_ELEM, GSXML.DOC_NODE_ELEM+GSXML.LIST_MODIFIER, GSXML.DOC_NODE_ELEM, GSXML.NODE_CONTENT_ELEM};452 path = GSPath.createPath(links);453 dc_response_doc_content = (Element)GSXML.getNodeByPath(enrich_response, path);454 455 }456 } // if provide_annotations457 458 459 // use the returned id rather than the sent one cos there may have460 // been modifiers such as .pr that are 
removed.461 String modified_doc_id = dc_response_doc.getAttribute(GSXML.NODE_ID_ATT);462 the_document.setAttribute("selectedNode", modified_doc_id);463 if (has_dummy) {464 // change the id if necessary and add the content465 Element dummy_node = (Element)doc_nodes.item(0);466 467 dummy_node.setAttribute(GSXML.NODE_ID_ATT, modified_doc_id);468 dummy_node.appendChild(this.doc.importNode(dc_response_doc_content, true));469 // hack for simple type470 if (document_type.equals("simple")) {471 // we dont want the internal docNode, just want the content and metadata in the document472 // rethink this!!473 the_document.removeChild(dummy_node);474 475 NodeList dummy_children = dummy_node.getChildNodes();476 //for (int i=0; i<dummy_children.getLength(); i++) {477 for (int i=dummy_children.getLength()-1; i>=0; i--) {478 // special case as we don't want more than one metadata list479 if (dummy_children.item(i).getNodeName().equals(GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER)) {480 GSXML.mergeMetadataFromList(the_document, dummy_children.item(i));481 } else {482 the_document.appendChild(dummy_children.item(i));483 }484 }485 }486 } else {487 // Merge the document content with the metadata and structure information488 for (int i = 0; i < doc_nodes.getLength(); i++) {489 Node dn = doc_nodes.item(i);490 String dn_id = ((Element)dn).getAttribute(GSXML.NODE_ID_ATT);491 if (dn_id.equals(modified_doc_id)) {492 dn.appendChild(this.doc.importNode(dc_response_doc_content, true));493 break;494 }495 }496 }497 }498 logger.debug("(DocumentAction) Page:\n" + this.converter.getPrettyString(result));499 return result;500 }501 502 /** tell the param class what its arguments are503 * if an action has its own arguments, this should add them to the params504 * object - particularly important for args that should not be saved */505 public boolean getActionParameters(GSParams params) {506 params.addParameter(GOTO_PAGE_ARG, false);507 params.addParameter(ENRICH_DOC_ARG, false);508 return true;509 }510 511 
512 /** this method gets the collection description, the format info, the513 * list of enrich services, etc - stuff that is needed for the page,514 * but is the same whatever the query is - should be cached */515 protected boolean getBackgroundData(Element page_response,516 String collection, String lang,517 String uid) {518 519 // create a message to process - contains requests for the collection520 // description, the format element, the enrich services on offer521 // these could all be cached522 Element info_message = this.doc.createElement(GSXML.MESSAGE_ELEM);523 String path = GSPath.appendLink(collection, "DocumentContentRetrieve");524 // the format request - ignore for now, where does this request go to??525 Element format_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_FORMAT, path, lang, uid);526 info_message.appendChild(format_request);527 528 // the enrich_services request - only do this if provide_annotations is true529 530 if (provide_annotations) {531 Element enrich_services_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, "", lang, uid);532 enrich_services_request.setAttribute(GSXML.INFO_ATT, "serviceList");533 info_message.appendChild(enrich_services_request);534 }535 536 Element info_response = (Element)this.mr.process(info_message);537 538 // the collection is the first response539 NodeList responses = info_response.getElementsByTagName(GSXML.RESPONSE_ELEM);540 Element format_resp = (Element) responses.item(0);541 542 Element format_elem = (Element)GSXML.getChildByTagName(format_resp, GSXML.FORMAT_ELEM);543 if (format_elem != null) {544 logger.debug("doc action found a format statement");545 // set teh format type546 format_elem.setAttribute(GSXML.TYPE_ATT, "display");547 page_response.appendChild(this.doc.importNode(format_elem, true));548 }549 550 if (provide_annotations) {551 Element services_resp = (Element)responses.item(1);552 553 // a new message for the mr554 Element enrich_message = 
this.doc.createElement(GSXML.MESSAGE_ELEM);555 556 NodeList e_services = services_resp.getElementsByTagName(GSXML.SERVICE_ELEM);557 boolean service_found = false;558 for (int j=0; j<e_services.getLength(); j++) {559 if (((Element)e_services.item(j)).getAttribute(GSXML.TYPE_ATT).equals("enrich")) {560 Element s = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_DESCRIBE, ((Element)e_services.item(j)).getAttribute(GSXML.NAME_ATT), lang, uid);561 enrich_message.appendChild(s);562 service_found = true;563 }564 }565 if (service_found) {566 Element enrich_response = (Element)this.mr.process(enrich_message);567 568 NodeList e_responses = enrich_response.getElementsByTagName(GSXML.RESPONSE_ELEM);569 Element service_list = this.doc.createElement(GSXML.SERVICE_ELEM + GSXML.LIST_MODIFIER);570 for (int i=0; i<e_responses.getLength(); i++) {571 Element e_resp = (Element)e_responses.item(i);572 Element e_service = (Element)this.doc.importNode(GSXML.getChildByTagName(e_resp, GSXML.SERVICE_ELEM), true);573 e_service.setAttribute(GSXML.NAME_ATT, e_resp.getAttribute(GSXML.FROM_ATT));574 service_list.appendChild(e_service);575 }576 page_response.appendChild(service_list);577 }578 } // if provide_annotations579 return true;580 581 }582 583 /** this involves a bit of a hack to get the equivalent query terms - has to requery the query service - uses the last selected service name. (if it ends in query). should this action do the query or should it send a message to the query action? but that will involve lots of extra stuff. 
also doesn't handle phrases properly - just highlights all the terms found in the text.584 */585 protected Element highlightQueryTerms(Element request, Element dc_response_doc_content) {586 587 // do the query again to get term info588 Element cgi_param_list = (Element)GSXML.getChildByTagName(request, GSXML.PARAM_ELEM+GSXML.LIST_MODIFIER);589 HashMap params = GSXML.extractParams(cgi_param_list, false);590 591 HashMap previous_params = (HashMap)params.get("p");592 if (previous_params == null) {593 return dc_response_doc_content;594 }595 String service_name = (String)previous_params.get(GSParams.SERVICE);596 if (service_name == null || !service_name.endsWith("Query")) { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy597 logger.debug("invalid service, not doing highlighting");598 return dc_response_doc_content;599 }600 String collection = (String)params.get(GSParams.COLLECTION);601 String lang = request.getAttribute(GSXML.LANG_ATT);602 String uid = request.getAttribute(GSXML.USER_ID_ATT);603 String to = GSPath.appendLink(collection, service_name);604 605 Element mr_query_message = this.doc.createElement(GSXML.MESSAGE_ELEM);606 Element mr_query_request = GSXML.createBasicRequest(this.doc, GSXML.REQUEST_TYPE_PROCESS, to, lang, uid);607 mr_query_message.appendChild(mr_query_request);608 609 // paramList610 HashMap service_params = (HashMap)params.get("s1");611 612 Element query_param_list = this.doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);613 GSXML.addParametersToList(this.doc, query_param_list, service_params);614 mr_query_request.appendChild(query_param_list);615 616 // do the query617 Element mr_query_response = (Element)this.mr.process(mr_query_message);618 619 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);620 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);621 if (query_term_list_element == null) {622 // no 
term info623 logger.error("No query term information.\n");624 return dc_response_doc_content;625 }626 627 String content = GSXML.getNodeText(dc_response_doc_content);628 629 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);630 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);631 632 HashSet query_term_variants = new HashSet();633 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");634 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {635 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);636 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);637 for (int j = 0; j < equivalent_terms.length; j++) {638 query_term_variants.add(equivalent_terms[j]);639 }640 }641 642 ArrayList phrase_query_term_variants_hierarchy = new ArrayList();643 644 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");645 String performed_query = GSXML.getNodeText(query_element) + " ";646 647 ArrayList phrase_query_p_term_variants_list = new ArrayList();648 int term_start = 0;649 boolean in_term = false;650 boolean in_phrase = false;651 for (int i = 0; i < performed_query.length(); i++) {652 char character = performed_query.charAt(i);653 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);654 655 // Has a query term just started?656 if (in_term == false && is_character_letter_or_digit == true) {657 in_term = true;658 term_start = i;659 }660 661 // Or has a term just finished?662 else if (in_term == true && is_character_letter_or_digit == false) {663 in_term = false;664 String term = performed_query.substring(term_start, i);665 666 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);667 if (term_element != null) {668 669 HashSet 
phrase_query_p_term_x_variants = new HashSet();670 671 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");672 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {673 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);674 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);675 for (int k = 0; k < term_equivalent_terms.length; k++) {676 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);677 }678 }679 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);680 681 if (in_phrase == false) {682 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);683 phrase_query_p_term_variants_list = new ArrayList();684 }685 }686 }687 // Watch for phrases (surrounded by quotes)688 if (character == '\"') {689 // Has a phrase just started?690 if (in_phrase == false) {691 in_phrase = true;692 }693 // Or has a phrase just finished?694 else if (in_phrase == true) {695 in_phrase = false;696 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);697 }698 699 phrase_query_p_term_variants_list = new ArrayList();700 }701 }702 703 return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);704 }705 706 707 /**708 * Highlights query terms in a piece of text.709 */710 private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)711 {712 // Convert the content string to an array of characters for speed713 char[] content_characters = new char[content.length()];714 content.getChars(0, content.length(), content_characters, 0);715 716 // Now skim through the content, identifying word matches717 ArrayList word_matches = new ArrayList();718 int word_start = 0;719 boolean in_word = false;720 boolean preceding_word_matched = false;721 for (int i = 0; i < 
content_characters.length; i++) {722 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);723 724 // Has a word just started?725 if (in_word == false && is_character_letter_or_digit == true) {726 in_word = true;727 word_start = i;728 }729 730 // Or has a word just finished?731 else if (in_word == true && is_character_letter_or_digit == false) {732 in_word = false;733 734 // Check if the word matches any of the query term equivalents735 String word = new String(content_characters, word_start, (i - word_start));736 if (query_term_variants.contains(word)) {737 // We have found a matching word, so remember its location738 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));739 preceding_word_matched = true;740 }741 else {742 preceding_word_matched = false;743 }744 }745 }746 747 // Don't forget the last word...748 if (in_word == true) {749 // Check if the word matches any of the query term equivalents750 String word = new String(content_characters, word_start, (content_characters.length - word_start));751 if (query_term_variants.contains(word)) {752 // We have found a matching word, so remember its location753 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));754 }755 }756 757 ArrayList highlight_start_positions = new ArrayList();758 ArrayList highlight_end_positions = new ArrayList();759 760 // Deal with phrases now761 ArrayList partial_phrase_matches = new ArrayList();762 for (int i = 0; i < word_matches.size(); i++) {763 WordMatch word_match = (WordMatch) word_matches.get(i);764 765 // See if any partial phrase matches are extended by this word766 if (word_match.preceding_word_matched) {767 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {768 PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);769 ArrayList phrase_query_p_term_variants_list = (ArrayList) 
phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);770 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);771 if (phrase_query_p_term_x_variants.contains(word_match.word)) {772 partial_phrase_match.num_words_matched++;773 774 // Has a complete phrase match occurred?775 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {776 // Check for overlaps by looking at the previous highlight range777 if (!highlight_end_positions.isEmpty()) {778 int last_highlight_index = highlight_end_positions.size() - 1;779 int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();780 if (last_highlight_end > partial_phrase_match.start_position) {781 // There is an overlap, so remove the previous phrase match782 int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();783 highlight_end_positions.remove(last_highlight_index);784 partial_phrase_match.start_position = last_highlight_start;785 }786 }787 788 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));789 highlight_end_positions.add(new Integer(word_match.end_position));790 }791 // No, but add the partial match back into the list for next time792 else {793 partial_phrase_matches.add(partial_phrase_match);794 }795 }796 }797 }798 else {799 partial_phrase_matches.clear();800 }801 802 // See if this word is at the start of any of the phrases803 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {804 ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);805 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);806 if (phrase_query_p_term_1_variants.contains(word_match.word)) {807 // If this phrase is just one word long, we have a complete match808 if 
(phrase_query_p_term_variants_list.size() == 1) {809 highlight_start_positions.add(new Integer(word_match.start_position));810 highlight_end_positions.add(new Integer(word_match.end_position));811 }812 // Otherwise we have the start of a potential phrase match813 else {814 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));815 }816 }817 }818 }819 820 // Now add the annotation tags into the document at the correct points821 Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);822 823 int last_wrote = 0;824 for (int i = 0; i < highlight_start_positions.size(); i++) {825 int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();826 int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();827 828 // Print anything before the highlight range829 if (last_wrote < highlight_start) {830 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));831 content_element.appendChild(this.doc.createTextNode(preceding_text));832 }833 834 // Print the highlight text, annotated835 if (highlight_end > last_wrote) {836 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));837 Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);838 annotation_element.setAttribute("type", "query_term");839 content_element.appendChild(annotation_element);840 last_wrote = highlight_end;841 }842 }843 844 // Finish off any unwritten text845 if (last_wrote < content_characters.length) {846 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));847 content_element.appendChild(this.doc.createTextNode(remaining_text));848 }849 850 return content_element;851 }852 853 854 static private class WordMatch855 {856 public String word;857 public int start_position;858 public int end_position;859 public boolean preceding_word_matched;860 861 public 
WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)862 {863 this.word = word;864 this.start_position = start_position;865 this.end_position = end_position;866 this.preceding_word_matched = preceding_word_matched;867 }868 }869 870 871 static private class PartialPhraseMatch872 {873 public int start_position;874 public int query_phrase_number;875 public int num_words_matched;876 877 public PartialPhraseMatch(int start_position, int query_phrase_number)878 {879 this.start_position = start_position;880 this.query_phrase_number = query_phrase_number;881 this.num_words_matched = 1;882 }883 }884 940 }
Note:
See TracChangeset
for help on using the changeset viewer.