Changeset 32505
- Timestamp:
- 2018-10-09T16:02:10+13:00 (5 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java
r32448 r32505 85 85 86 86 Element message = GSXML.nodeToElement(message_node); 87 Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();87 Document doc = XMLConverter.newDOM(); 88 88 89 89 // the response … … 98 98 99 99 // just in case there are some that need to get passed to the services 100 // why do we use s0 here and s1 in other places??? 100 101 HashMap service_params = (HashMap) params.get("s0"); 101 102 … … 167 168 // are we editing mode? just get the archive document, convert to our internal doc format, and return it 168 169 if (editing_document) { 169 170 // call get archive doc 171 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM); 172 String to = "DocXMLGetSection"; 173 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 174 dx_message.appendChild(dx_request); 175 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM); 176 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id); 177 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection); 178 dx_request.appendChild(dx_section); 179 180 Element dx_response_message = (Element) this.mr.process(dx_message); 181 if (processErrorElements(dx_response_message, page_response)) 182 { 183 return result; 184 } 185 186 // get the section out 187 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM); 188 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path); 189 if (section == null) { 190 logger.error("no archive doc returned for "+document_id); 191 return result; 192 } 193 // convert the archive format into the internal format that the page response requires 194 195 // work out doctype 196 // NOTE: this will be coming from collection database in index 197 // the archive file doesn't store this. So we have to assume 198 // that the doc type will not be changing with any 199 // modifications happening to archives. 200 201 // if doc type is null, then we need to work it out. 202 // create a basic doc list containing the current node 203 204 if (document_type == null) { 205 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 206 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 207 basic_doc_list.appendChild(current_doc); 208 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id); 209 basic_doc_list.appendChild(current_doc); 210 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response); 211 } 212 213 if (document_type == null) { 214 logger.debug("@@@ doctype is null, setting to simple"); 215 document_type = GSXML.DOC_TYPE_SIMPLE; 216 } 217 218 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM); 219 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type); 220 page_response.appendChild(doc_elem); 221 222 Element transformed_section = transformArchiveToDocument(section); 223 if (document_type == GSXML.DOC_TYPE_SIMPLE) { 224 // simple doc, only returning a single document node, which is the top level section. 225 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id); 226 GSXML.mergeElements(doc_elem, transformed_section); 227 return result; 228 } 229 230 // multi sectioned document. 231 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id); 232 // In docEdit mode, we obtain the text from archives, from doc.xml 233 // Now the transformation has replaced <Section> with <documentNode> 234 // Need to add nodeID, nodeType and docType attributes to each docNode 235 // as doc.xml doesn't store that. 236 insertDocNodeAttributes(transformed_section, document_type, null); 237 doc_elem.appendChild(doc.importNode(transformed_section, true)); 238 logger.debug("dx result = "+XMLConverter.getPrettyString(result)); 239 240 return result; 170 return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext); 241 171 } 242 172 … … 321 251 322 252 the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type); 253 254 // start getting doc structure 323 255 324 256 // Create a parameter list to specify the required structure information … … 482 414 } 483 415 416 // end getting doc structure 417 418 // start getting doc metadata 419 484 420 // Build a request to obtain some document metadata 485 421 Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM); … … 611 547 } 612 548 549 550 HashSet<String> query_term_variants = null; 551 ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null; 552 boolean do_highlight_query_terms = highlight_query_terms; 553 if (highlight_query_terms) { 554 // lets get the query term equivalents 555 query_term_variants = new HashSet<String>(); 556 phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 557 if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) { 558 do_highlight_query_terms = false; // we couldn't get the terms 559 } 560 } 561 562 // lets try marking up the metadata with search terms 563 if (do_highlight_query_terms) { 564 highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy); 565 } 566 613 567 // Build a request to obtain some document content 614 568 Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM); … … 657 611 if (content != null) 658 612 { 659 if ( highlight_query_terms)613 if (do_highlight_query_terms) 660 614 { 661 662 content = highlightQueryTerms(request, node_id, (Element) content); 615 content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy); 663 616 } 664 617 … … 708 661 return result; 709 662 } 710 if ( highlight_query_terms)663 if (do_highlight_query_terms) 711 664 { 712 665 dc_response_doc.removeChild(dc_response_doc_content); 713 666 714 dc_response_doc_content = highlightQueryTerms (request, null, dc_response_doc_content);667 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy); 715 668 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true)); 716 669 } … … 803 756 } 804 757 758 protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) { 759 // call get archive doc 760 Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM); 761 String to = "DocXMLGetSection"; 762 Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 763 dx_message.appendChild(dx_request); 764 Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM); 765 dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id); 766 dx_section.setAttribute(GSXML.COLLECTION_ATT, collection); 767 dx_request.appendChild(dx_section); 768 769 Element dx_response_message = (Element) this.mr.process(dx_message); 770 if (processErrorElements(dx_response_message, page_response)) 771 { 772 return result; 773 } 774 775 // get the section out 776 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM); 777 Element section = (Element) GSXML.getNodeByPath(dx_response_message, path); 778 if (section == null) { 779 logger.error("no archive doc returned for "+document_id); 780 return result; 781 } 782 // convert the archive format into the internal format that the page response requires 783 784 // work out doctype 785 // NOTE: this will be coming from collection database in index 786 // the archive file doesn't store this. So we have to assume 787 // that the doc type will not be changing with any 788 // modifications happening to archives. 789 790 // if doc type is null, then we need to work it out. 791 // create a basic doc list containing the current node 792 793 if (document_type == null) { 794 Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 795 Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 796 basic_doc_list.appendChild(current_doc); 797 current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id); 798 basic_doc_list.appendChild(current_doc); 799 document_type = getDocumentType(basic_doc_list, collection, userContext, page_response); 800 } 801 802 if (document_type == null) { 803 logger.debug("@@@ doctype is null, setting to simple"); 804 document_type = GSXML.DOC_TYPE_SIMPLE; 805 } 806 807 Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM); 808 doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type); 809 page_response.appendChild(doc_elem); 810 811 Element transformed_section = transformArchiveToDocument(section); 812 if (document_type == GSXML.DOC_TYPE_SIMPLE) { 813 // simple doc, only returning a single document node, which is the top level section. 814 doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id); 815 GSXML.mergeElements(doc_elem, transformed_section); 816 return result; 817 } 818 819 // multi sectioned document. 820 transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id); 821 // In docEdit mode, we obtain the text from archives, from doc.xml 822 // Now the transformation has replaced <Section> with <documentNode> 823 // Need to add nodeID, nodeType and docType attributes to each docNode 824 // as doc.xml doesn't store that. 825 insertDocNodeAttributes(transformed_section, document_type, null); 826 doc_elem.appendChild(doc.importNode(transformed_section, true)); 827 logger.debug("dx result = "+XMLConverter.getPrettyString(result)); 828 829 return result; 830 } 831 805 832 806 833 private boolean needSectionContent(HashMap<String, Serializable> params) { … … 1009 1036 } 1010 1037 1011 1012 /** 1013 * this involves a bit of a hack to get the equivalent query terms - has to 1014 * requery the query service - uses the last selected service name. (if it 1015 * ends in query). should this action do the query or should it send a 1016 * message to the query action? but that will involve lots of extra stuff. 1017 * also doesn't handle phrases properly - just highlights all the terms 1018 * found in the text. 1019 */ 1020 protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content) 1038 /** 1039 * this involves a bit of a hack to get the equivalent query terms - has to 1040 * requery the query service - uses the last selected service name. (if it 1041 * ends in query). should this action do the query or should it send a 1042 * message to the query action? but that will involve lots of extra stuff. 1043 */ 1044 protected boolean getQueryTermVariants(Element request, String current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 1045 { 1046 Document doc = request.getOwnerDocument(); 1047 1048 // do the query again to get term info 1049 Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1050 //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list)); 1051 HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 1052 1053 HashMap previous_params = (HashMap) params.get("p"); 1054 if (previous_params == null) 1055 { 1056 //logger.error("no p parms"); 1057 return false; 1058 } 1059 String service_name = (String) previous_params.get(GSParams.SERVICE); 1060 if (service_name == null || !service_name.endsWith("Query")) 1061 { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 1062 logger.debug("invalid service, not doing highlighting"); 1063 return false; 1064 } 1065 1066 String collection = (String) params.get(GSParams.COLLECTION); 1067 UserContext userContext = new UserContext(request); 1068 String to = GSPath.appendLink(collection, service_name); 1069 1070 Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM); 1071 Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 1072 mr_query_message.appendChild(mr_query_request); 1073 1074 // paramList 1075 HashMap service_params = (HashMap) params.get("s1"); 1076 1077 Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 1078 GSXML.addParametersToList(query_param_list, service_params); 1079 // is this only used for solr??? - do we still want it for solr?? 1080 // if (current_node_id != null) { 1081 // GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id); 1082 // } else { 1083 // GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 1084 // } 1085 mr_query_request.appendChild(query_param_list); 1086 // do the query 1087 1088 Element mr_query_response = (Element) this.mr.process(mr_query_message); 1089 1090 // find the term lists 1091 String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER); 1092 Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path); 1093 if (query_term_list_element == null) 1094 { 1095 // no term info 1096 logger.error("No query term information. xx\n"); 1097 return false; 1098 } 1099 // logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element)); 1100 //String content = GSXML.getNodeText(dc_response_doc_content); 1101 1102 String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 1103 Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 1104 1105 NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 1106 if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 1107 { 1108 NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 1109 if (terms_nodelist != null && terms_nodelist.getLength() > 0) 1110 { 1111 for (int i = 0; i < terms_nodelist.getLength(); i++) 1112 { 1113 String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name"); 1114 String termValueU = null; 1115 String termValueL = null; 1116 1117 if (termValue.length() > 1) 1118 { 1119 termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 1120 termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 1121 } 1122 else 1123 { 1124 termValueU = termValue.substring(0, 1).toUpperCase(); 1125 termValueL = termValue.substring(0, 1).toLowerCase(); 1126 } 1127 query_term_variants.add(termValueU); 1128 query_term_variants.add(termValueL); 1129 } 1130 } 1131 } 1132 else 1133 { 1134 for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) 1135 { 1136 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i); 1137 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT); 1138 for (int j = 0; j < equivalent_terms.length; j++) 1139 { 1140 query_term_variants.add(equivalent_terms[j]); 1141 } 1142 } 1143 } 1144 1145 1146 Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 1147 String performed_query = GSXML.getNodeText(query_element) + " "; 1148 1149 ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1150 int term_start = 0; 1151 boolean in_term = false; 1152 boolean in_phrase = false; 1153 for (int i = 0; i < performed_query.length(); i++) 1154 { 1155 char character = performed_query.charAt(i); 1156 boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 1157 1158 // Has a query term just started? 1159 if (in_term == false && is_character_letter_or_digit == true) 1160 { 1161 in_term = true; 1162 term_start = i; 1163 } 1164 1165 // Or has a term just finished? 1166 else if (in_term == true && is_character_letter_or_digit == false) 1167 { 1168 in_term = false; 1169 String term = performed_query.substring(term_start, i); 1170 1171 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 1172 if (term_element != null) 1173 { 1174 1175 HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 1176 1177 NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 1178 if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 1179 { 1180 String termValueU = null; 1181 String termValueL = null; 1182 1183 if (term.length() > 1) 1184 { 1185 termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 1186 termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 1187 } 1188 else 1189 { 1190 termValueU = term.substring(0, 1).toUpperCase(); 1191 termValueL = term.substring(0, 1).toLowerCase(); 1192 } 1193 1194 phrase_query_p_term_x_variants.add(termValueU); 1195 phrase_query_p_term_x_variants.add(termValueL); 1196 } 1197 else 1198 { 1199 for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) 1200 { 1201 Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j); 1202 String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT); 1203 for (int k = 0; k < term_equivalent_terms.length; k++) 1204 { 1205 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]); 1206 } 1207 } 1208 } 1209 phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 1210 1211 if (in_phrase == false) 1212 { 1213 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 1214 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1215 } 1216 } 1217 } 1218 // Watch for phrases (surrounded by quotes) 1219 if (character == '\"') 1220 { 1221 // Has a phrase just started? 1222 if (in_phrase == false) 1223 { 1224 in_phrase = true; 1225 } 1226 // Or has a phrase just finished? 1227 else if (in_phrase == true) 1228 { 1229 in_phrase = false; 1230 phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 1231 } 1232 1233 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 1234 } 1235 } 1236 1237 return true; 1238 } 1239 1240 /** redo the request to get the query terms then highlight them in the text 1241 * 1242 */ 1243 protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content) 1021 1244 { 1022 1245 Document doc = request.getOwnerDocument(); … … 1064 1287 { 1065 1288 // Build a request to process highlighted text 1066 1289 logger.error("highlighted node is not null!!!!"); 1290 logger.error(XMLConverter.getPrettyString(highlighted_Node)); 1067 1291 Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); 1068 1292 to = GSPath.appendLink(collection, "DocumentContentRetrieve"); … … 1094 1318 { 1095 1319 // no term info 1096 logger.error("No query term information. \n");1320 logger.error("No query term information. yy\n"); 1097 1321 return dc_response_doc_content; 1098 1322 } … … 1238 1462 } 1239 1463 1240 return highlightQueryTermsInternal (doc, content, query_term_variants, phrase_query_term_variants_hierarchy);1464 return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy); 1241 1465 } 1242 1466 1467 /** 1468 * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem 1469 */ 1470 protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) { 1471 1472 //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem)); 1473 NodeList named_elems = top_level_elem.getElementsByTagName(element_name); 1474 for (int j=named_elems.getLength()-1; j>=0; j--) { 1475 Element this_elem = (Element)named_elems.item(j); 1476 Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy); 1477 this_elem.getParentNode().replaceChild(replacement_elem, this_elem); 1478 } 1479 1480 1481 //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem)); 1482 return true; 1483 } 1243 1484 /** 1244 * Highlights query terms in a piece of text.1485 * Highlights query terms in the text content of an element. 1245 1486 */ 1246 private Element highlightQueryTermsInternal(Document doc, String content,HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)1487 private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/ HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 1247 1488 { 1489 //logger.error("in hl internal, query terms are "+query_term_variants.toString()); 1490 String content = GSXML.getNodeText(original_element); 1491 //logger.error("original elem = "+XMLConverter.getPrettyString(original_element)); 1492 logger.error("highlighting content: "+content); 1248 1493 // Convert the content string to an array of characters for speed 1249 1494 char[] content_characters = new char[content.length()]; … … 1289 1534 // Check if the word matches any of the query term equivalents 1290 1535 String word = new String(content_characters, word_start, (i - word_start)); 1536 //logger.error("word: "+word); 1291 1537 if (query_term_variants.contains(word)) 1292 1538 { 1539 //logger.error("matched"); 1293 1540 // We have found a matching word, so remember its location 1294 1541 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); … … 1393 1640 1394 1641 // Now add the annotation tags into the document at the correct points 1642 //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM); 1643 Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children. 1644 int last_wrote = 0; 1645 for (int i = 0; i < highlight_start_positions.size(); i++) 1646 { 1647 int highlight_start = highlight_start_positions.get(i).intValue(); 1648 int highlight_end = highlight_end_positions.get(i).intValue(); 1649 1650 // Print anything before the highlight range 1651 if (last_wrote < highlight_start) 1652 { 1653 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote)); 1654 content_element.appendChild(doc.createTextNode(preceding_text)); 1655 } 1656 1657 // Print the highlight text, annotated 1658 if (highlight_end > last_wrote) 1659 { 1660 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start)); 1661 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text); 1662 annotation_element.setAttribute("type", "query_term"); 1663 content_element.appendChild(annotation_element); 1664 last_wrote = highlight_end; 1665 } 1666 } 1667 1668 // Finish off any unwritten text 1669 if (last_wrote < content_characters.length) 1670 { 1671 String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote)); 1672 content_element.appendChild(doc.createTextNode(remaining_text)); 1673 } 1674 return content_element; 1675 } 1676 1677 private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 1678 { 1679 // Convert the content string to an array of characters for speed 1680 char[] content_characters = new char[content.length()]; 1681 content.getChars(0, content.length(), content_characters, 0); 1682 1683 // Now skim through the content, identifying word matches 1684 ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>(); 1685 int word_start = 0; 1686 boolean in_word = false; 1687 boolean preceding_word_matched = false; 1688 boolean inTag = false; 1689 for (int i = 0; i < content_characters.length; i++) 1690 { 1691 //We don't want to find words inside HTML tags 1692 if (content_characters[i] == '<') 1693 { 1694 inTag = true; 1695 continue; 1696 } 1697 else if (inTag && content_characters[i] == '>') 1698 { 1699 inTag = false; 1700 } 1701 else if (inTag) 1702 { 1703 continue; 1704 } 1705 1706 boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]); 1707 1708 // Has a word just started? 1709 if (in_word == false && is_character_letter_or_digit == true) 1710 { 1711 in_word = true; 1712 word_start = i; 1713 } 1714 1715 // Or has a word just finished? 1716 else if (in_word == true && is_character_letter_or_digit == false) 1717 { 1718 in_word = false; 1719 1720 // Check if the word matches any of the query term equivalents 1721 String word = new String(content_characters, word_start, (i - word_start)); 1722 if (query_term_variants.contains(word)) 1723 { 1724 // We have found a matching word, so remember its location 1725 word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 1726 preceding_word_matched = true; 1727 } 1728 else 1729 { 1730 preceding_word_matched = false; 1731 } 1732 } 1733 } 1734 1735 // Don't forget the last word... 1736 if (in_word == true) 1737 { 1738 // Check if the word matches any of the query term equivalents 1739 String word = new String(content_characters, word_start, (content_characters.length - word_start)); 1740 if (query_term_variants.contains(word)) 1741 { 1742 // We have found a matching word, so remember its location 1743 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched)); 1744 } 1745 } 1746 1747 ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>(); 1748 ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>(); 1749 1750 // Deal with phrases now 1751 ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>(); 1752 for (int i = 0; i < word_matches.size(); i++) 1753 { 1754 WordMatch word_match = word_matches.get(i); 1755 1756 // See if any partial phrase matches are extended by this word 1757 if (word_match.preceding_word_matched) 1758 { 1759 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) 1760 { 1761 PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j); 1762 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number); 1763 HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched); 1764 if (phrase_query_p_term_x_variants.contains(word_match.word)) 1765 { 1766 partial_phrase_match.num_words_matched++; 1767 1768 // Has a complete phrase match occurred? 1769 if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) 1770 { 1771 // Check for overlaps by looking at the previous highlight range 1772 if (!highlight_end_positions.isEmpty()) 1773 { 1774 int last_highlight_index = highlight_end_positions.size() - 1; 1775 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue(); 1776 if (last_highlight_end > partial_phrase_match.start_position) 1777 { 1778 // There is an overlap, so remove the previous phrase match 1779 int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue(); 1780 highlight_end_positions.remove(last_highlight_index); 1781 partial_phrase_match.start_position = last_highlight_start; 1782 } 1783 } 1784 1785 highlight_start_positions.add(new Integer(partial_phrase_match.start_position)); 1786 highlight_end_positions.add(new Integer(word_match.end_position)); 1787 } 1788 // No, but add the partial match back into the list for next time 1789 else 1790 { 1791 partial_phrase_matches.add(partial_phrase_match); 1792 } 1793 } 1794 } 1795 } 1796 else 1797 { 1798 partial_phrase_matches.clear(); 1799 } 1800 1801 // See if this word is at the start of any of the phrases 1802 for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) 1803 { 1804 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p); 1805 if (phrase_query_p_term_variants_list.size()>0) { 1806 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0); 1807 if (phrase_query_p_term_1_variants.contains(word_match.word)) 1808 { 1809 // If this phrase is just one word long, we have a complete match 1810 if (phrase_query_p_term_variants_list.size() == 1) 1811 { 1812 highlight_start_positions.add(new Integer(word_match.start_position)); 1813 highlight_end_positions.add(new Integer(word_match.end_position)); 1814 } 1815 // Otherwise we have the start of a potential phrase match 1816 else 1817 { 1818 partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p)); 1819 } 1820 } 1821 } 1822 } 1823 } 1824 1825 // Now add the annotation tags into the document at the correct points 1395 1826 Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM); 1396 1827
Note:
See TracChangeset
for help on using the changeset viewer.