- Timestamp:
- 2018-10-29T12:19:46+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java
r32505 r32545 35 35 import java.util.HashMap; 36 36 import java.util.HashSet; 37 import java.util.Iterator; 37 38 import java.io.File; 38 39 import java.io.Serializable; … … 541 542 GSXML.mergeMetadataLists(the_document, top_doc_node); 542 543 544 // if we are highlighting query terms, then we also get them highlighted in the metadata 545 546 HashSet<String> query_term_variants = null; 547 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null; 548 boolean do_highlight_query_terms = highlight_query_terms; 549 int query_terms_status = 0; 550 if (highlight_query_terms) { 551 // lets get the query term equivalents 552 query_term_variants = new HashSet<String>(); 553 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 554 if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) { 555 do_highlight_query_terms = false; // we couldn't get the terms 556 } 557 } 558 559 // lets try marking up the metadata with search terms 560 // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching 561 boolean highlight_case_insensitive = false; 562 if (query_terms_status == NO_EQUIV_QUERY_TERMS) { 563 highlight_case_insensitive = true; 564 } 565 if (do_highlight_query_terms) { 566 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 567 } 568 543 569 // do we want doc text content? If not, we are done. 544 570 if (!get_text) { … … 546 572 return result; 547 573 } 548 549 574 550 HashSet<String> query_term_variants = null;551 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;552 boolean do_highlight_query_terms = highlight_query_terms;553 if (highlight_query_terms) {554 // lets get the query term equivalents555 query_term_variants = new HashSet<String>();556 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();557 if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) {558 do_highlight_query_terms = false; // we couldn't get the terms559 }560 }561 562 // lets try marking up the metadata with search terms563 if (do_highlight_query_terms) {564 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);565 }566 567 575 // Build a request to obtain some document content 568 576 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM); … … 599 607 Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path); 600 608 609 boolean get_marked_up_doc_from_query = false; 610 if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) { 611 get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't 612 } 613 601 614 if (expand_document) 602 615 { … … 606 619 { 607 620 String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT); 608 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);609 621 Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id); 610 622 Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM); 611 623 if (content != null) 612 624 { 613 if (do_highlight_query_terms) 614 { 615 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy); 616 } 617 618 doc_nodes.item(i).appendChild(doc.importNode(content, true)); 619 } 620 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i)); 625 if (do_highlight_query_terms) { 626 if (get_marked_up_doc_from_query) { 627 628 Element new_content = retrieveHighlightedContent(request, node_id); 629 630 if (new_content == null) { 631 // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene) 632 get_marked_up_doc_from_query = false; 633 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 634 } else { 635 content= new_content; 636 } 637 } else { 638 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 639 } 640 } 641 doc_nodes.item(i).appendChild(doc.importNode(content, true)); 642 } 643 621 644 } 622 645 if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) { … … 641 664 else 642 665 { 643 //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);644 666 Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM); 645 667 Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM); 646 //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");647 668 648 669 if (dc_response_doc_content == null) … … 651 672 if (dc_response_doc.getAttribute("external").equals("true")) 652 673 { 653 654 //if (dc_response_doc_external != null)655 //{656 674 String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT); 657 675 … … 664 682 { 665 683 dc_response_doc.removeChild(dc_response_doc_content); 666 667 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy); 684 if (get_marked_up_doc_from_query) { 685 Element new_content = retrieveHighlightedContent(request, null); 686 if (new_content == null) { 687 get_marked_up_doc_from_query = false; 688 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 689 } else { 690 691 dc_response_doc_content = new_content; 692 } 693 } else { 694 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 695 } 668 696 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true)); 669 697 } … … 720 748 721 749 NodeList dummy_children = dummy_node.getChildNodes(); 722 //for (int i=0; i<dummy_children.getLength(); i++) {723 750 for (int i = dummy_children.getLength() - 1; i >= 0; i--) 724 751 { … … 1036 1063 } 1037 1064 1065 protected final int NO_QUERY_TERMS = 0; 1066 protected final int NO_EQUIV_QUERY_TERMS = 1; 1067 protected final int EQUIV_QUERY_TERMS = 2; 1038 1068 /** 1039 1069 * this involves a bit of a hack to get the equivalent query terms - has to 1040 1070 * requery the query service - uses the last selected service name. (if it 1041 * ends in query). should this action do the query or should it send a 1042 * message to the query action? but that will involve lots of extra stuff. 1071 * ends in query). 1043 1072 */ 1044 protected boolean getQueryTermVariants(Element request, String current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)1073 protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 1045 1074 { 1046 Document doc = request.getOwnerDocument();1047 1075 Document doc = XMLConverter.newDOM(); 1076 1048 1077 // do the query again to get term info 1049 1078 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1050 //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list));1051 1079 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 1052 1080 … … 1054 1082 if (previous_params == null) 1055 1083 { 1056 //logger.error("no p parms"); 1057 return false; 1084 return NO_QUERY_TERMS; 1058 1085 } 1059 1086 String service_name = (String) previous_params.get(GSParams.SERVICE); 1060 1087 if (service_name == null || !service_name.endsWith("Query")) 1061 1088 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 1062 logger.debug("invalid service , not doing highlighting");1063 return false;1089 logger.debug("invalid service "+service_name+", not doing highlighting"); 1090 return NO_QUERY_TERMS; 1064 1091 } 1065 1092 … … 1077 1104 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1078 1105 GSXML.addParametersToList(query_param_list, service_params); 1079 // is this only used for solr??? - do we still want it for solr??1080 // if (current_node_id != null) {1081 // GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);1082 // } else {1083 // GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));1084 // }1085 1106 mr_query_request.appendChild(query_param_list); 1107 1086 1108 // do the query 1087 1088 1109 Element mr_query_response = (Element) this.mr.process(mr_query_message); 1089 1110 1090 1111 // find the term lists 1091 1112 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER); … … 1094 1115 { 1095 1116 // no term info 1096 logger.error("No query term information. xx\n"); 1097 return false; 1117 return NO_QUERY_TERMS; 1098 1118 } 1099 // logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element)); 1100 //String content = GSXML.getNodeText(dc_response_doc_content); 1101 1102 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 1103 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 1104 1119 1120 int result_code = NO_EQUIV_QUERY_TERMS; 1105 1121 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 1106 1122 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 1107 1123 { 1124 // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on 1108 1125 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 1109 1126 if (terms_nodelist != null && terms_nodelist.getLength() > 0) … … 1112 1129 { 1113 1130 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name"); 1114 String termValueU = null; 1115 String termValueL = null; 1116 1117 if (termValue.length() > 1) 1118 { 1119 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 1120 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 1121 } 1122 else 1123 { 1124 termValueU = termValue.substring(0, 1).toUpperCase(); 1125 termValueL = termValue.substring(0, 1).toLowerCase(); 1126 } 1127 query_term_variants.add(termValueU); 1128 query_term_variants.add(termValueL); 1131 query_term_variants.add(termValue.toLowerCase()); 1129 1132 } 1130 1133 } … … 1132 1135 else 1133 1136 { 1137 result_code = EQUIV_QUERY_TERMS; 1134 1138 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) 1135 1139 { … … 1143 1147 } 1144 1148 1149 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 1150 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 1145 1151 1146 1152 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 1147 1153 String performed_query = GSXML.getNodeText(query_element) + " "; 1148 1154 logger.debug("performed query="+performed_query); 1155 1156 boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure 1157 if (performed_query.contains("\"")) { 1158 has_phrases = true; 1159 } 1160 1149 1161 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1150 1162 int term_start = 0; 1151 1163 boolean in_term = false; 1152 1164 boolean in_phrase = false; 1153 for (int i = 0; i < performed_query.length(); i++) 1154 {1165 for (int i = 0; i < performed_query.length(); i++) { 1166 1155 1167 char character = performed_query.charAt(i); 1156 1168 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); … … 1168 1180 in_term = false; 1169 1181 String term = performed_query.substring(term_start, i); 1170 1171 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 1172 if (term_element != null) 1173 { 1182 if (has_phrases) { 1183 // do the phrase bit 1184 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 1185 if (result_code == EQUIV_QUERY_TERMS) { 1186 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 1187 if (term_element != null) { 1188 // might be null for eg TX in [snails]:TX 1174 1189 1175 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 1176 1177 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 1178 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 1179 { 1180 String termValueU = null; 1181 String termValueL = null; 1182 1183 if (term.length() > 1) 1184 { 1185 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 1186 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 1187 } 1188 else 1189 { 1190 termValueU = term.substring(0, 1).toUpperCase(); 1191 termValueL = term.substring(0, 1).toLowerCase(); 1192 } 1193 1194 phrase_query_p_term_x_variants.add(termValueU); 1195 phrase_query_p_term_x_variants.add(termValueL); 1196 } 1197 else 1198 { 1190 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 1191 if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) { 1199 1192 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) 1200 1193 { … … 1207 1200 } 1208 1201 } 1202 } 1203 } else { // result_code != EQUIV_QUERY_TERMS 1204 // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on 1205 if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) { 1206 // this handles the case where the user has searched for snails, but term list returns 'snail' 1207 phrase_query_p_term_x_variants.add(term.toLowerCase()); 1208 } 1209 } 1210 if (phrase_query_p_term_x_variants.size()>0) { 1211 // we have found a valid term 1209 1212 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 1210 1213 … … 1215 1218 } 1216 1219 } 1217 } 1220 } // end if has_phrases 1221 else { 1222 // no phrases so we don't have to do the phrasey stuff. but 1223 // we need to check the term against the query term list - if its not in there, check whether its the root of a term. 1224 // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted. 1225 // but dont want to include eg TX 1226 if (result_code == NO_EQUIV_QUERY_TERMS) { 1227 if (containsSubString(query_term_variants, term)) { 1228 query_term_variants.add(term.toLowerCase()); 1229 } 1230 } 1231 1232 } 1233 } // end of in_term... 1218 1234 // Watch for phrases (surrounded by quotes) 1219 if (character == '\"') 1220 {1235 if (character == '\"') { 1236 1221 1237 // Has a phrase just started? 1222 1238 if (in_phrase == false) … … 1232 1248 1233 1249 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1234 } 1250 } // if char == " 1251 } // for each char in performed query 1252 1253 return result_code; 1254 } 1255 1256 protected boolean containsSubString(HashSet<String> query_term_variants, String term) { 1257 // hack to filter out TX, TI field names 1258 String lc_term = term.toLowerCase(); 1259 if (query_term_variants.contains(term)) { 1260 return false; // or true?? 1261 } 1262 if (term.matches("[A-Z][A-Z][A-Z]?")) { 1263 return false; 1264 } 1265 Iterator i = query_term_variants.iterator(); 1266 while (i.hasNext()) { 1267 String t = (String)i.next(); 1268 if (term.startsWith(t)) { 1269 return true; 1235 1270 } 1236 1237 return true;1271 } 1272 return false; 1238 1273 } 1239 1274 1240 /** redo the request to get the query terms then highlight them in the text 1241 * 1242 */ 1243 protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content) 1244 { 1245 Document doc = request.getOwnerDocument(); 1246 1247 // do the query again to get term info 1248 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1249 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 1250 1251 HashMap previous_params = (HashMap) params.get("p"); 1252 if (previous_params == null) 1253 { 1254 return dc_response_doc_content; 1255 } 1256 String service_name = (String) previous_params.get(GSParams.SERVICE); 1257 if (service_name == null || !service_name.endsWith("Query")) 1258 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 1259 logger.debug("invalid service, not doing highlighting"); 1260 return dc_response_doc_content; 1261 } 1262 String collection = (String) params.get(GSParams.COLLECTION); 1263 UserContext userContext = new UserContext(request); 1264 String to = GSPath.appendLink(collection, service_name); 1265 1266 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM); 1267 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 1268 mr_query_message.appendChild(mr_query_request); 1269 1270 // paramList 1271 HashMap service_params = (HashMap) params.get("s1"); 1272 1273 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1274 GSXML.addParametersToList(query_param_list, service_params); 1275 if (current_node_id != null) { 1276 GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id); 1277 } else { 1278 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 1279 } 1280 mr_query_request.appendChild(query_param_list); 1281 // do the query 1282 Element mr_query_response = (Element) this.mr.process(mr_query_message); 1283 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM); 1284 Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode); 1285 // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements 1286 if (highlighted_Node != null) 1287 { 1288 // Build a request to process highlighted text 1289 logger.error("highlighted node is not null!!!!"); 1290 logger.error(XMLConverter.getPrettyString(highlighted_Node)); 1291 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); 1292 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); 1293 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 1294 hl_message.appendChild(dc_request); 1295 1296 // Create a parameter list to specify the request parameters - empty for now 1297 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1298 dc_request.appendChild(dc_param_list); 1299 1300 // get the content 1301 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 1302 dc_request.appendChild(doc_list); 1303 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 1304 doc_list.appendChild(current_doc); 1305 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT)); 1306 //Append highlighted content to request for processing 1307 dc_request.appendChild(doc.importNode(highlighted_Node, true)); 1308 Element hl_response_message = (Element) this.mr.process(hl_message); 1309 1310 //Get results 1311 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM); 1312 Element content = (Element) contentList.item(0); 1313 return content; 1314 } 1315 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER); 1316 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path); 1317 if (query_term_list_element == null) 1318 { 1319 // no term info 1320 logger.error("No query term information. yy\n"); 1321 return dc_response_doc_content; 1322 } 1323 1324 String content = GSXML.getNodeText(dc_response_doc_content); 1325 1326 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 1327 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 1328 1329 HashSet<String> query_term_variants = new HashSet<String>(); 1330 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 1331 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 1332 { 1333 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 1334 if (terms_nodelist != null && terms_nodelist.getLength() > 0) 1335 { 1336 for (int i = 0; i < terms_nodelist.getLength(); i++) 1337 { 1338 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name"); 1339 String termValueU = null; 1340 String termValueL = null; 1341 1342 if (termValue.length() > 1) 1343 { 1344 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 1345 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 1346 } 1347 else 1348 { 1349 termValueU = termValue.substring(0, 1).toUpperCase(); 1350 termValueL = termValue.substring(0, 1).toLowerCase(); 1351 } 1352 1353 query_term_variants.add(termValueU); 1354 query_term_variants.add(termValueL); 1355 } 1356 } 1357 } 1358 else 1359 { 1360 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) 1361 { 1362 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i); 1363 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT); 1364 for (int j = 0; j < equivalent_terms.length; j++) 1365 { 1366 query_term_variants.add(equivalent_terms[j]); 1367 } 1368 } 1369 } 1370 1371 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 1372 1373 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 1374 String performed_query = GSXML.getNodeText(query_element) + " "; 1375 1376 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1377 int term_start = 0; 1378 boolean in_term = false; 1379 boolean in_phrase = false; 1380 for (int i = 0; i < performed_query.length(); i++) 1381 { 1382 char character = performed_query.charAt(i); 1383 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 1384 1385 // Has a query term just started? 1386 if (in_term == false && is_character_letter_or_digit == true) 1387 { 1388 in_term = true; 1389 term_start = i; 1390 } 1391 1392 // Or has a term just finished? 1393 else if (in_term == true && is_character_letter_or_digit == false) 1394 { 1395 in_term = false; 1396 String term = performed_query.substring(term_start, i); 1397 1398 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 1399 if (term_element != null) 1400 { 1401 1402 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 1403 1404 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 1405 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 1406 { 1407 String termValueU = null; 1408 String termValueL = null; 1409 1410 if (term.length() > 1) 1411 { 1412 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 1413 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 1414 } 1415 else 1416 { 1417 termValueU = term.substring(0, 1).toUpperCase(); 1418 termValueL = term.substring(0, 1).toLowerCase(); 1419 } 1420 1421 phrase_query_p_term_x_variants.add(termValueU); 1422 phrase_query_p_term_x_variants.add(termValueL); 1423 } 1424 else 1425 { 1426 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) 1427 { 1428 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j); 1429 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT); 1430 for (int k = 0; k < term_equivalent_terms.length; k++) 1431 { 1432 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]); 1433 } 1434 } 1435 } 1436 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 1437 1438 if (in_phrase == false) 1439 { 1440 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 1441 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1442 } 1443 } 1444 } 1445 // Watch for phrases (surrounded by quotes) 1446 if (character == '\"') 1447 { 1448 // Has a phrase just started? 1449 if (in_phrase == false) 1450 { 1451 in_phrase = true; 1452 } 1453 // Or has a phrase just finished? 1454 else if (in_phrase == true) 1455 { 1456 in_phrase = false; 1457 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 1458 } 1459 1460 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1461 } 1462 } 1463 1464 return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy); 1465 } 1466 1275 1276 /** retrieve the marked up highlighted section - only works for solr collection */ 1277 protected Element retrieveHighlightedContent(Element request, String node_id) { 1278 1279 Document doc = XMLConverter.newDOM(); 1280 1281 // do the query again to get term info 1282 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1283 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 1284 1285 HashMap previous_params = (HashMap) params.get("p"); 1286 if (previous_params == null) 1287 { 1288 return null; 1289 } 1290 String service_name = (String) previous_params.get(GSParams.SERVICE); 1291 if (service_name == null || !service_name.endsWith("Query")) 1292 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 1293 logger.debug("HL invalid service, not doing highlighting"); 1294 return null; 1295 } 1296 1297 String collection = (String) params.get(GSParams.COLLECTION); 1298 UserContext userContext = new UserContext(request); 1299 String to = GSPath.appendLink(collection, service_name); 1300 1301 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM); 1302 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 1303 mr_query_message.appendChild(mr_query_request); 1304 1305 // paramList 1306 HashMap service_params = (HashMap) params.get("s1"); 1307 1308 // hack in case the user searched on eg titles, but we want highlighting in the text 1309 service_params.put("index", "TX"); 1310 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1311 GSXML.addParametersToList(query_param_list, service_params); 1312 1313 if (node_id != null) { 1314 GSXML.addParameterToList(query_param_list, "hldocOID", node_id); 1315 } else { 1316 GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 1317 } 1318 mr_query_request.appendChild(query_param_list); 1319 // do the query 1320 1321 Element mr_query_response = (Element) this.mr.process(mr_query_message); 1322 String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM); 1323 Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode); 1324 1325 if (highlighted_node == null) { 1326 return null; 1327 } 1328 // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up. 1329 //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements 1330 1331 // Build a request to process highlighted text 1332 1333 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); 1334 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); 1335 Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 1336 hl_message.appendChild(dc_request); 1337 1338 // Create a parameter list to specify the request parameters - empty for now 1339 Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1340 dc_request.appendChild(dc_param_list); 1341 1342 // get the content 1343 Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 1344 dc_request.appendChild(doc_list); 1345 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 1346 doc_list.appendChild(current_doc); 1347 current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT)); 1348 //Append highlighted content to request for processing 1349 dc_request.appendChild(doc.importNode(highlighted_node, true)); 1350 Element hl_response_message = (Element) this.mr.process(hl_message); 1351 //Get results 1352 NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM); 1353 Element content = (Element) contentList.item(0); 1354 return content; 1355 1356 1357 } 1467 1358 /** 1468 1359 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem 1469 1360 */ 1470 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) { 1471 1472 //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem)); 1361 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) { 1362 1473 1363 NodeList named_elems = top_level_elem.getElementsByTagName(element_name); 1474 1364 for (int j=named_elems.getLength()-1; j>=0; j--) { 1475 1365 Element this_elem = (Element)named_elems.item(j); 1476 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy );1366 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive); 1477 1367 this_elem.getParentNode().replaceChild(replacement_elem, this_elem); 1478 1368 } 1479 1480 1481 //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem));1482 1369 return true; 1483 1370 } … … 1485 1372 * Highlights query terms in the text content of an element. 1486 1373 */ 1487 private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/ HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)1374 private Element highlightQueryTermsElementText(Document doc, Element original_element, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) 1488 1375 { 1489 //logger.error("in hl internal, query terms are "+query_term_variants.toString());1490 1376 String content = GSXML.getNodeText(original_element); 1491 //logger.error("original elem = "+XMLConverter.getPrettyString(original_element));1492 logger.error("highlighting content: "+content);1493 1377 // Convert the content string to an array of characters for speed 1494 1378 char[] content_characters = new char[content.length()]; … … 1506 1390 if (content_characters[i] == '<') 1507 1391 { 1392 // are we currently in a word? 1393 if (in_word) { 1394 in_word = false; 1395 String word = new String(content_characters, word_start, (i - word_start)); 1396 if (case_insensitive) { 1397 word = word.toLowerCase(); 1398 } 1399 if (query_term_variants.contains(word)) { 1400 // We have found a matching word, so remember its location 1401 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 1402 // should preceding word matched be set to true/false here?? 1403 preceding_word_matched = true; 1404 } else { 1405 preceding_word_matched = false; 1406 } 1407 } 1508 1408 inTag = true; 1509 1409 continue; … … 1512 1412 { 1513 1413 inTag = false; 1414 continue; 1514 1415 } 1515 1416 else if (inTag) … … 1534 1435 // Check if the word matches any of the query term equivalents 1535 1436 String word = new String(content_characters, word_start, (i - word_start)); 1536 //logger.error("word: "+word); 1437 if (case_insensitive) { 1438 word = word.toLowerCase(); 1439 } 1537 1440 if (query_term_variants.contains(word)) 1538 1441 { 1539 //logger.error("matched");1540 1442 // We have found a matching word, so remember its location 1541 1443 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); … … 1554 1456 // Check if the word matches any of the query term equivalents 1555 1457 String word = new String(content_characters, word_start, (content_characters.length - word_start)); 1458 if (case_insensitive) { 1459 word = word.toLowerCase(); 1460 } 1556 1461 if (query_term_variants.contains(word)) 1557 1462 { … … 1561 1466 } 1562 1467 1468 if (word_matches.size() == 0) { 1469 // just return a copy of the original element 1470 return (Element)doc.importNode(original_element, true); 1471 1472 } 1473 1563 1474 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>(); 1564 1475 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>(); 1565 1476 1477 if (phrase_query_term_variants_hierarchy.size() ==0) { 1478 for (int i = 0; i < word_matches.size(); i++) { 1479 highlight_start_positions.add(new Integer(word_matches.get(i).start_position)); 1480 highlight_end_positions.add(new Integer(word_matches.get(i).end_position)); 1481 } 1482 } 1483 else { 1566 1484 // Deal with phrases now 1567 1485 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>(); … … 1638 1556 } 1639 1557 } 1558 } 1640 1559 1641 1560 // Now add the annotation tags into the document at the correct points 1642 //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);1643 1561 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children. 1644 1562 int last_wrote = 0; … … 1675 1593 } 1676 1594 1677 private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)1678 {1679 // Convert the content string to an array of characters for speed1680 char[] content_characters = new char[content.length()];1681 content.getChars(0, content.length(), content_characters, 0);1682 1683 // Now skim through the content, identifying word matches1684 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();1685 int word_start = 0;1686 boolean in_word = false;1687 boolean preceding_word_matched = false;1688 boolean inTag = false;1689 for (int i = 0; i < content_characters.length; i++)1690 {1691 //We don't want to find words inside HTML tags1692 if (content_characters[i] == '<')1693 {1694 inTag = true;1695 continue;1696 }1697 else if (inTag && content_characters[i] == '>')1698 {1699 inTag = false;1700 }1701 else if (inTag)1702 {1703 continue;1704 }1705 1706 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);1707 1708 // Has a word just started?1709 if (in_word == false && is_character_letter_or_digit == true)1710 {1711 in_word = true;1712 word_start = i;1713 }1714 1715 // Or has a word just finished?1716 else if (in_word == true && is_character_letter_or_digit == false)1717 {1718 in_word = false;1719 1720 // Check if the word matches any of the query term equivalents1721 String word = new String(content_characters, word_start, (i - word_start));1722 if (query_term_variants.contains(word))1723 {1724 // We have found a matching word, so remember its location1725 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));1726 preceding_word_matched = true;1727 }1728 else1729 {1730 preceding_word_matched = false;1731 }1732 }1733 }1734 1735 // Don't forget the last word...1736 if (in_word == true)1737 {1738 // Check if the word matches any of the query term equivalents1739 String word = new String(content_characters, word_start, (content_characters.length - word_start));1740 if (query_term_variants.contains(word))1741 {1742 // We have found a matching word, so remember its location1743 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));1744 }1745 }1746 1747 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();1748 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();1749 1750 // Deal with phrases now1751 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();1752 for (int i = 0; i < word_matches.size(); i++)1753 {1754 WordMatch word_match = word_matches.get(i);1755 1756 // See if any partial phrase matches are extended by this word1757 if (word_match.preceding_word_matched)1758 {1759 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)1760 {1761 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);1762 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);1763 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);1764 if (phrase_query_p_term_x_variants.contains(word_match.word))1765 {1766 partial_phrase_match.num_words_matched++;1767 1768 // Has a complete phrase match occurred?1769 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())1770 {1771 // Check for overlaps by looking at the previous highlight range1772 if (!highlight_end_positions.isEmpty())1773 {1774 int last_highlight_index = highlight_end_positions.size() - 1;1775 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();1776 if (last_highlight_end > partial_phrase_match.start_position)1777 {1778 // There is an overlap, so remove the previous phrase match1779 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();1780 highlight_end_positions.remove(last_highlight_index);1781 partial_phrase_match.start_position = last_highlight_start;1782 }1783 }1784 1785 highlight_start_positions.add(new Integer(partial_phrase_match.start_position));1786 highlight_end_positions.add(new Integer(word_match.end_position));1787 }1788 // No, but add the partial match back into the list for next time1789 else1790 {1791 partial_phrase_matches.add(partial_phrase_match);1792 }1793 }1794 }1795 }1796 else1797 {1798 partial_phrase_matches.clear();1799 }1800 1801 // See if this word is at the start of any of the phrases1802 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)1803 {1804 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);1805 if (phrase_query_p_term_variants_list.size()>0) {1806 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);1807 if (phrase_query_p_term_1_variants.contains(word_match.word))1808 {1809 // If this phrase is just one word long, we have a complete match1810 if (phrase_query_p_term_variants_list.size() == 1)1811 {1812 highlight_start_positions.add(new Integer(word_match.start_position));1813 highlight_end_positions.add(new Integer(word_match.end_position));1814 }1815 // Otherwise we have the start of a potential phrase match1816 else1817 {1818 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));1819 }1820 }1821 }1822 }1823 }1824 1825 // Now add the annotation tags into the document at the correct points1826 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);1827 1828 int last_wrote = 0;1829 for (int i = 0; i < highlight_start_positions.size(); i++)1830 {1831 int highlight_start = highlight_start_positions.get(i).intValue();1832 int highlight_end = highlight_end_positions.get(i).intValue();1833 1834 // Print anything before the highlight range1835 if (last_wrote < highlight_start)1836 {1837 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));1838 content_element.appendChild(doc.createTextNode(preceding_text));1839 }1840 1841 // Print the highlight text, annotated1842 if (highlight_end > last_wrote)1843 {1844 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));1845 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);1846 annotation_element.setAttribute("type", "query_term");1847 content_element.appendChild(annotation_element);1848 last_wrote = highlight_end;1849 }1850 }1851 1852 // Finish off any unwritten text1853 if (last_wrote < content_characters.length)1854 {1855 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));1856 content_element.appendChild(doc.createTextNode(remaining_text));1857 }1858 return content_element;1859 }1860 1595 1861 1596 static private class WordMatch
Note:
See TracChangeset
for help on using the changeset viewer.