Changeset 32545

Show
Ignore:
Timestamp:
29.10.2018 12:19:46 (3 weeks ago)
Author:
kjdon
Message:

Changes to query term highlighting. 1. Added back the functionality for getting marked-up text back from the indexer - useful for solr, as it doesn't give you the equivalent terms list, so you don't know which terms it has matched; e.g. a query for snail will match snail and snails. Only do this for text though, not metadata. 2. If the query doesn't give back equivalent terms (solr and lucene), then do case-insensitive matching. Previously it just did upper and lower case for the first letter, but that didn't match e.g. SNAIL. 3. If a term in the performed query is not in the terms list (e.g. in the case of solr, for the query snails the term returned will be snail), but it starts with a term in the terms list, then add it to the terms list. This helps with a solr query of snails, but not with a query of snail - which matches snails, but snails won't get highlighted. 4. If there is no double quote in the query string, then don't worry about trying to make the phrase structure; just find all the matches of the terms and highlight them. I think that's all...

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

    r32505 r32545  
    3535import java.util.HashMap; 
    3636import java.util.HashSet; 
     37import java.util.Iterator; 
    3738import java.io.File; 
    3839import java.io.Serializable; 
     
    541542        GSXML.mergeMetadataLists(the_document, top_doc_node); 
    542543 
     544        // if we are highlighting query terms, then we also get them highlighted in the metadata 
     545         
     546        HashSet<String> query_term_variants = null; 
     547        ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null; 
     548        boolean do_highlight_query_terms = highlight_query_terms; 
     549        int query_terms_status = 0; 
     550        if (highlight_query_terms) { 
     551          // lets get the query term equivalents 
     552          query_term_variants = new HashSet<String>(); 
     553          phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 
     554          if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) { 
     555            do_highlight_query_terms = false; // we couldn't get the terms 
     556          } 
     557        } 
     558 
     559        // lets try marking up the metadata with search terms 
     560        // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching 
     561        boolean highlight_case_insensitive = false; 
     562        if (query_terms_status == NO_EQUIV_QUERY_TERMS) { 
     563          highlight_case_insensitive = true; 
     564        } 
     565        if (do_highlight_query_terms) { 
     566          highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 
     567        } 
     568 
    543569        // do we want doc text content? If not, we are done. 
    544570        if (!get_text) { 
     
    546572          return result; 
    547573        } 
    548  
    549574         
    550         HashSet<String> query_term_variants = null; 
    551         ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null; 
    552         boolean do_highlight_query_terms = highlight_query_terms; 
    553         if (highlight_query_terms) { 
    554           // lets get the query term equivalents 
    555           query_term_variants = new HashSet<String>(); 
    556           phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 
    557           if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) { 
    558             do_highlight_query_terms = false; // we couldn't get the terms 
    559           } 
    560         } 
    561  
    562         // lets try marking up the metadata with search terms 
    563         if (do_highlight_query_terms) { 
    564           highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy); 
    565         } 
    566  
    567575        // Build a request to obtain some document content 
    568576        Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM); 
     
    599607        Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path); 
    600608 
     609        boolean get_marked_up_doc_from_query = false; 
     610        if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) { 
     611          get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't 
     612        } 
     613 
    601614        if (expand_document) 
    602615        { 
     
    606619            { 
    607620                String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT); 
    608                 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM); 
    609621                Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id); 
    610622                Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM); 
    611623                if (content != null) 
    612624                { 
    613                     if (do_highlight_query_terms) 
    614                     { 
    615                       content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy); 
    616                     } 
    617                      
    618                     doc_nodes.item(i).appendChild(doc.importNode(content, true)); 
    619                 } 
    620                 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i)); 
     625                  if (do_highlight_query_terms) { 
     626                    if (get_marked_up_doc_from_query) { 
     627                       
     628                      Element new_content = retrieveHighlightedContent(request, node_id); 
     629                       
     630                      if (new_content == null) { 
     631                    // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene) 
     632                    get_marked_up_doc_from_query = false; 
     633                    content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 
     634                      } else { 
     635                    content= new_content; 
     636                      } 
     637                    } else { 
     638                      content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 
     639                    } 
     640                  } 
     641                  doc_nodes.item(i).appendChild(doc.importNode(content, true)); 
     642                } 
     643 
    621644            } 
    622645            if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) { 
     
    641664        else 
    642665        { 
    643             //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM); 
    644666            Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM); 
    645667            Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM); 
    646             //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external"); 
    647668 
    648669            if (dc_response_doc_content == null) 
     
    651672                if (dc_response_doc.getAttribute("external").equals("true")) 
    652673                { 
    653  
    654                     //if (dc_response_doc_external != null) 
    655                     //{ 
    656674                    String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT); 
    657675 
     
    664682            { 
    665683                dc_response_doc.removeChild(dc_response_doc_content); 
    666  
    667                 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy); 
     684                if (get_marked_up_doc_from_query) { 
     685                  Element new_content = retrieveHighlightedContent(request, null); 
     686                  if (new_content == null) { 
     687                    get_marked_up_doc_from_query = false; 
     688                    dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 
     689                  } else { 
     690                     
     691                    dc_response_doc_content = new_content; 
     692                  } 
     693                } else { 
     694                  dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive); 
     695                } 
    668696                dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true)); 
    669697            } 
     
    720748 
    721749                    NodeList dummy_children = dummy_node.getChildNodes(); 
    722                     //for (int i=0; i<dummy_children.getLength(); i++) { 
    723750                    for (int i = dummy_children.getLength() - 1; i >= 0; i--) 
    724751                    { 
     
    10361063  } 
    10371064 
     1065  protected final int NO_QUERY_TERMS = 0; 
     1066  protected final int NO_EQUIV_QUERY_TERMS = 1; 
     1067  protected final int EQUIV_QUERY_TERMS = 2; 
    10381068  /** 
    10391069   * this involves a bit of a hack to get the equivalent query terms - has to 
    10401070   * requery the query service - uses the last selected service name. (if it 
    1041    * ends in query). should this action do the query or should it send a 
    1042    * message to the query action? but that will involve lots of extra stuff.  
     1071   * ends in query).   
    10431072   */ 
    1044   protected boolean getQueryTermVariants(Element request, String  current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 
     1073  protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 
    10451074  { 
    1046     Document doc = request.getOwnerDocument(); 
    1047      
     1075    Document doc = XMLConverter.newDOM();  
     1076 
    10481077    // do the query again to get term info  
    10491078    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
    1050     //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list)); 
    10511079    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 
    10521080     
     
    10541082    if (previous_params == null) 
    10551083      { 
    1056     //logger.error("no p parms"); 
    1057     return false; 
     1084    return NO_QUERY_TERMS; 
    10581085      } 
    10591086    String service_name = (String) previous_params.get(GSParams.SERVICE); 
    10601087    if (service_name == null || !service_name.endsWith("Query")) 
    10611088      { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 
    1062     logger.debug("invalid service, not doing highlighting"); 
    1063     return false; 
     1089    logger.debug("invalid service "+service_name+", not doing highlighting"); 
     1090    return NO_QUERY_TERMS; 
    10641091      } 
    10651092 
     
    10771104    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
    10781105    GSXML.addParametersToList(query_param_list, service_params); 
    1079     // is this only used for solr??? - do we still want it for solr?? 
    1080     // if (current_node_id != null) { 
    1081     //   GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id); 
    1082     // } else { 
    1083     //   GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 
    1084     // } 
    10851106    mr_query_request.appendChild(query_param_list); 
     1107 
    10861108    // do the query 
    1087  
    10881109    Element mr_query_response = (Element) this.mr.process(mr_query_message); 
    1089  
     1110        
    10901111    // find the term lists 
    10911112    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER); 
     
    10941115      { 
    10951116    // no term info 
    1096     logger.error("No query term information. xx\n"); 
    1097     return false; 
     1117    return  NO_QUERY_TERMS; 
    10981118      } 
    1099     //    logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element)); 
    1100     //String content = GSXML.getNodeText(dc_response_doc_content); 
    1101      
    1102     String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 
    1103     Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 
    1104      
     1119     
     1120    int result_code = NO_EQUIV_QUERY_TERMS; 
    11051121    NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 
    11061122    if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 
    11071123      { 
     1124    // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on 
    11081125    NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 
    11091126    if (terms_nodelist != null && terms_nodelist.getLength() > 0) 
     
    11121129          { 
    11131130        String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name"); 
    1114         String termValueU = null; 
    1115         String termValueL = null; 
    1116          
    1117         if (termValue.length() > 1) 
    1118           { 
    1119             termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 
    1120             termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 
    1121           } 
    1122         else 
    1123           { 
    1124             termValueU = termValue.substring(0, 1).toUpperCase(); 
    1125             termValueL = termValue.substring(0, 1).toLowerCase(); 
    1126           } 
    1127         query_term_variants.add(termValueU); 
    1128         query_term_variants.add(termValueL); 
     1131        query_term_variants.add(termValue.toLowerCase()); 
    11291132          } 
    11301133      } 
     
    11321135    else 
    11331136      { 
     1137    result_code = EQUIV_QUERY_TERMS; 
    11341138    for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) 
    11351139      { 
     
    11431147      } 
    11441148     
     1149    String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 
     1150    Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 
    11451151 
    11461152    Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 
    11471153    String performed_query = GSXML.getNodeText(query_element) + " "; 
    1148  
     1154    logger.debug("performed query="+performed_query); 
     1155 
     1156    boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure 
     1157    if (performed_query.contains("\"")) { 
     1158      has_phrases = true; 
     1159    } 
     1160     
    11491161    ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 
    11501162    int term_start = 0; 
    11511163    boolean in_term = false; 
    11521164    boolean in_phrase = false; 
    1153     for (int i = 0; i < performed_query.length(); i++) 
    1154       { 
     1165    for (int i = 0; i < performed_query.length(); i++) { 
     1166       
    11551167    char character = performed_query.charAt(i); 
    11561168    boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 
     
    11681180        in_term = false; 
    11691181        String term = performed_query.substring(term_start, i); 
    1170          
    1171         Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 
    1172         if (term_element != null) 
    1173           { 
     1182        if (has_phrases) { 
     1183          // do the phrase bit 
     1184          HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 
     1185          if (result_code == EQUIV_QUERY_TERMS) { 
     1186        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 
     1187        if (term_element != null) { 
     1188          // might be null for eg TX in [snails]:TX 
    11741189         
    1175         HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 
    1176          
    1177         NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 
    1178         if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 
    1179           { 
    1180             String termValueU = null; 
    1181             String termValueL = null; 
    1182              
    1183             if (term.length() > 1) 
    1184               { 
    1185             termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 
    1186             termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 
    1187               } 
    1188             else 
    1189               { 
    1190             termValueU = term.substring(0, 1).toUpperCase(); 
    1191             termValueL = term.substring(0, 1).toLowerCase(); 
    1192               } 
    1193              
    1194             phrase_query_p_term_x_variants.add(termValueU); 
    1195             phrase_query_p_term_x_variants.add(termValueL); 
    1196           } 
    1197         else 
    1198           { 
     1190          NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 
     1191          if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) { 
    11991192            for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) 
    12001193              { 
     
    12071200              } 
    12081201          } 
     1202        } 
     1203          } else { // result_code != EQUIV_QUERY_TERMS 
     1204        // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on 
     1205        if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) { 
     1206          // this handles the case where the user has searched for snails, but term list returns 'snail' 
     1207            phrase_query_p_term_x_variants.add(term.toLowerCase()); 
     1208        } 
     1209          } 
     1210          if (phrase_query_p_term_x_variants.size()>0) { 
     1211        // we have found a valid term 
    12091212        phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 
    12101213         
     
    12151218          } 
    12161219          } 
    1217       } 
     1220        } // end if has_phrases 
     1221        else { 
     1222          // no phrases so we don't have to do the phrasey stuff. but 
     1223          // we need to check the term against the query term list - if its not in there, check whether its the root of a term. 
     1224          // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted. 
     1225          // but dont want to include eg TX 
     1226          if (result_code == NO_EQUIV_QUERY_TERMS) { 
     1227        if (containsSubString(query_term_variants, term)) { 
     1228          query_term_variants.add(term.toLowerCase()); 
     1229        } 
     1230          } 
     1231           
     1232        }  
     1233      } // end of in_term...  
    12181234    // Watch for phrases (surrounded by quotes) 
    1219     if (character == '\"') 
    1220       { 
     1235    if (character == '\"') { 
     1236       
    12211237        // Has a phrase just started? 
    12221238        if (in_phrase == false) 
     
    12321248         
    12331249        phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 
    1234       } 
     1250    } // if char == " 
     1251    } // for each char in performed query 
     1252   
     1253    return result_code; 
     1254  } 
     1255 
     1256  protected boolean containsSubString(HashSet<String> query_term_variants, String term) { 
     1257    // hack to filter out TX, TI field names 
     1258    String lc_term = term.toLowerCase(); 
     1259    if (query_term_variants.contains(term)) { 
     1260      return false; // or true?? 
     1261    } 
     1262    if (term.matches("[A-Z][A-Z][A-Z]?")) { 
     1263      return false; 
     1264    } 
     1265    Iterator i = query_term_variants.iterator(); 
     1266    while (i.hasNext()) { 
     1267      String t = (String)i.next(); 
     1268      if (term.startsWith(t)) { 
     1269    return true; 
    12351270      } 
    1236    
    1237     return true; 
     1271    } 
     1272    return false; 
    12381273  } 
    12391274 
    1240   /** redo the request to get the query terms then highlight them in the text 
    1241    *  
    1242    */ 
    1243   protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content) 
    1244     { 
    1245         Document doc = request.getOwnerDocument(); 
    1246          
    1247         // do the query again to get term info  
    1248         Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
    1249         HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 
    1250  
    1251         HashMap previous_params = (HashMap) params.get("p"); 
    1252         if (previous_params == null) 
    1253         { 
    1254             return dc_response_doc_content; 
    1255         } 
    1256         String service_name = (String) previous_params.get(GSParams.SERVICE); 
    1257         if (service_name == null || !service_name.endsWith("Query")) 
    1258         { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 
    1259             logger.debug("invalid service, not doing highlighting"); 
    1260             return dc_response_doc_content; 
    1261         } 
    1262         String collection = (String) params.get(GSParams.COLLECTION); 
    1263         UserContext userContext = new UserContext(request); 
    1264         String to = GSPath.appendLink(collection, service_name); 
    1265  
    1266         Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM); 
    1267         Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 
    1268         mr_query_message.appendChild(mr_query_request); 
    1269  
    1270         // paramList 
    1271         HashMap service_params = (HashMap) params.get("s1"); 
    1272  
    1273         Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
    1274         GSXML.addParametersToList(query_param_list, service_params); 
    1275         if (current_node_id != null) { 
    1276           GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id); 
    1277         } else { 
    1278           GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 
    1279         } 
    1280         mr_query_request.appendChild(query_param_list); 
    1281         // do the query 
    1282         Element mr_query_response = (Element) this.mr.process(mr_query_message); 
    1283         String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM); 
    1284         Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode); 
    1285         // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements 
    1286         if (highlighted_Node != null) 
    1287         { 
    1288             // Build a request to process highlighted text 
    1289           logger.error("highlighted node is not null!!!!"); 
    1290           logger.error(XMLConverter.getPrettyString(highlighted_Node)); 
    1291             Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); 
    1292             to = GSPath.appendLink(collection, "DocumentContentRetrieve"); 
    1293             Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 
    1294             hl_message.appendChild(dc_request); 
    1295  
    1296             // Create a parameter list to specify the request parameters - empty for now 
    1297             Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
    1298             dc_request.appendChild(dc_param_list); 
    1299  
    1300             // get the content 
    1301             Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 
    1302             dc_request.appendChild(doc_list); 
    1303             Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 
    1304             doc_list.appendChild(current_doc); 
    1305             current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT)); 
    1306             //Append highlighted content to request for processing 
    1307             dc_request.appendChild(doc.importNode(highlighted_Node, true)); 
    1308             Element hl_response_message = (Element) this.mr.process(hl_message); 
    1309          
    1310             //Get results 
    1311             NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM); 
    1312             Element content = (Element) contentList.item(0);     
    1313             return content; 
    1314         } 
    1315         String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER); 
    1316         Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path); 
    1317         if (query_term_list_element == null) 
    1318         { 
    1319             // no term info 
    1320             logger.error("No query term information. yy\n"); 
    1321             return dc_response_doc_content; 
    1322         } 
    1323  
    1324         String content = GSXML.getNodeText(dc_response_doc_content); 
    1325  
    1326         String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER); 
    1327         Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path); 
    1328  
    1329         HashSet<String> query_term_variants = new HashSet<String>(); 
    1330         NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList"); 
    1331         if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0) 
    1332         { 
    1333             NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term"); 
    1334             if (terms_nodelist != null && terms_nodelist.getLength() > 0) 
    1335             { 
    1336                 for (int i = 0; i < terms_nodelist.getLength(); i++) 
    1337                 { 
    1338                     String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name"); 
    1339                     String termValueU = null; 
    1340                     String termValueL = null; 
    1341  
    1342                     if (termValue.length() > 1) 
    1343                     { 
    1344                         termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1); 
    1345                         termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1); 
    1346                     } 
    1347                     else 
    1348                     { 
    1349                         termValueU = termValue.substring(0, 1).toUpperCase(); 
    1350                         termValueL = termValue.substring(0, 1).toLowerCase(); 
    1351                     } 
    1352  
    1353                     query_term_variants.add(termValueU); 
    1354                     query_term_variants.add(termValueL); 
    1355                 } 
    1356             } 
    1357         } 
    1358         else 
    1359         { 
    1360             for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) 
    1361             { 
    1362                 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i); 
    1363                 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT); 
    1364                 for (int j = 0; j < equivalent_terms.length; j++) 
    1365                 { 
    1366                     query_term_variants.add(equivalent_terms[j]); 
    1367                 } 
    1368             } 
    1369         } 
    1370  
    1371         ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>(); 
    1372  
    1373         Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query"); 
    1374         String performed_query = GSXML.getNodeText(query_element) + " "; 
    1375  
    1376         ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 
    1377         int term_start = 0; 
    1378         boolean in_term = false; 
    1379         boolean in_phrase = false; 
    1380         for (int i = 0; i < performed_query.length(); i++) 
    1381         { 
    1382             char character = performed_query.charAt(i); 
    1383             boolean is_character_letter_or_digit = Character.isLetterOrDigit(character); 
    1384  
    1385             // Has a query term just started? 
    1386             if (in_term == false && is_character_letter_or_digit == true) 
    1387             { 
    1388                 in_term = true; 
    1389                 term_start = i; 
    1390             } 
    1391  
    1392             // Or has a term just finished? 
    1393             else if (in_term == true && is_character_letter_or_digit == false) 
    1394             { 
    1395                 in_term = false; 
    1396                 String term = performed_query.substring(term_start, i); 
    1397  
    1398                 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term); 
    1399                 if (term_element != null) 
    1400                 { 
    1401  
    1402                     HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>(); 
    1403  
    1404                     NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList"); 
    1405                     if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0) 
    1406                     { 
    1407                         String termValueU = null; 
    1408                         String termValueL = null; 
    1409  
    1410                         if (term.length() > 1) 
    1411                         { 
    1412                             termValueU = term.substring(0, 1).toUpperCase() + term.substring(1); 
    1413                             termValueL = term.substring(0, 1).toLowerCase() + term.substring(1); 
    1414                         } 
    1415                         else 
    1416                         { 
    1417                             termValueU = term.substring(0, 1).toUpperCase(); 
    1418                             termValueL = term.substring(0, 1).toLowerCase(); 
    1419                         } 
    1420  
    1421                         phrase_query_p_term_x_variants.add(termValueU); 
    1422                         phrase_query_p_term_x_variants.add(termValueL); 
    1423                     } 
    1424                     else 
    1425                     { 
    1426                         for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) 
    1427                         { 
    1428                             Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j); 
    1429                             String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT); 
    1430                             for (int k = 0; k < term_equivalent_terms.length; k++) 
    1431                             { 
    1432                                 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]); 
    1433                             } 
    1434                         } 
    1435                     } 
    1436                     phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants); 
    1437  
    1438                     if (in_phrase == false) 
    1439                     { 
    1440                         phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 
    1441                         phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 
    1442                     } 
    1443                 } 
    1444             } 
    1445             // Watch for phrases (surrounded by quotes) 
    1446             if (character == '\"') 
    1447             { 
    1448                 // Has a phrase just started? 
    1449                 if (in_phrase == false) 
    1450                 { 
    1451                     in_phrase = true; 
    1452                 } 
    1453                 // Or has a phrase just finished? 
    1454                 else if (in_phrase == true) 
    1455                 { 
    1456                     in_phrase = false; 
    1457                     phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list); 
    1458                 } 
    1459  
    1460                 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>(); 
    1461             } 
    1462         } 
    1463  
    1464         return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy); 
    1465     } 
    1466  
     1275 
     1276  /** retrieve the marked up highlighted section - only works for solr collection */ 
     1277  protected Element retrieveHighlightedContent(Element request, String node_id) { 
     1278 
     1279    Document doc = XMLConverter.newDOM(); 
     1280 
     1281    // do the query again to get term info  
     1282    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
     1283    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false); 
     1284     
     1285    HashMap previous_params = (HashMap) params.get("p"); 
     1286    if (previous_params == null) 
     1287      { 
     1288    return null; 
     1289      } 
     1290    String service_name = (String) previous_params.get(GSParams.SERVICE); 
     1291    if (service_name == null || !service_name.endsWith("Query")) 
     1292      { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy 
     1293    logger.debug("HL invalid service, not doing highlighting"); 
     1294    return null; 
     1295      } 
     1296 
     1297    String collection = (String) params.get(GSParams.COLLECTION); 
     1298    UserContext userContext = new UserContext(request); 
     1299    String to = GSPath.appendLink(collection, service_name); 
     1300     
     1301    Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM); 
     1302    Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 
     1303    mr_query_message.appendChild(mr_query_request); 
     1304     
     1305    // paramList 
     1306    HashMap service_params = (HashMap) params.get("s1"); 
     1307 
     1308    // hack in case the user searched on eg titles, but we want highlighting in the text 
     1309    service_params.put("index", "TX"); 
     1310    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
     1311    GSXML.addParametersToList(query_param_list, service_params); 
     1312 
     1313     if (node_id != null) { 
     1314       GSXML.addParameterToList(query_param_list, "hldocOID", node_id); 
     1315     } else { 
     1316       GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT)); 
     1317     } 
     1318    mr_query_request.appendChild(query_param_list); 
     1319    // do the query 
     1320 
     1321    Element mr_query_response = (Element) this.mr.process(mr_query_message); 
     1322    String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM); 
     1323    Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode); 
     1324 
     1325     if (highlighted_node == null) { 
     1326       return null; 
     1327     } 
     1328     // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up. 
     1329     //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements 
     1330 
     1331     // Build a request to process highlighted text 
     1332      
     1333     Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM); 
     1334     to = GSPath.appendLink(collection, "DocumentContentRetrieve"); 
     1335     Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext); 
     1336     hl_message.appendChild(dc_request); 
     1337      
     1338     // Create a parameter list to specify the request parameters - empty for now 
     1339     Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER); 
     1340     dc_request.appendChild(dc_param_list); 
     1341 
     1342     // get the content 
     1343     Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER); 
     1344     dc_request.appendChild(doc_list); 
     1345     Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM); 
     1346     doc_list.appendChild(current_doc); 
     1347     current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT)); 
     1348     //Append highlighted content to request for processing 
     1349     dc_request.appendChild(doc.importNode(highlighted_node, true)); 
     1350     Element hl_response_message = (Element) this.mr.process(hl_message); 
     1351     //Get results 
     1352     NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM); 
     1353     Element content = (Element) contentList.item(0); 
     1354     return content; 
     1355      
     1356     
     1357  } 
    14671358  /**  
    14681359   * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem 
    14691360   */ 
    1470   protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) { 
    1471  
    1472     //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem)); 
     1361  protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) { 
     1362 
    14731363      NodeList named_elems = top_level_elem.getElementsByTagName(element_name); 
    14741364      for (int j=named_elems.getLength()-1; j>=0; j--) { 
    14751365    Element this_elem = (Element)named_elems.item(j); 
    1476     Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy); 
     1366    Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive); 
    14771367    this_elem.getParentNode().replaceChild(replacement_elem, this_elem); 
    14781368      } 
    1479      
    1480  
    1481       //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem)); 
    14821369    return true; 
    14831370  } 
     
    14851372     * Highlights query terms in the text content of an element. 
    14861373     */ 
    1487   private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/  HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 
     1374  private Element highlightQueryTermsElementText(Document doc, Element original_element,  HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) 
    14881375    { 
    1489       //logger.error("in hl internal, query terms are "+query_term_variants.toString()); 
    14901376      String content = GSXML.getNodeText(original_element); 
    1491       //logger.error("original elem = "+XMLConverter.getPrettyString(original_element)); 
    1492       logger.error("highlighting content: "+content); 
    14931377        // Convert the content string to an array of characters for speed 
    14941378        char[] content_characters = new char[content.length()]; 
     
    15061390            if (content_characters[i] == '<') 
    15071391            { 
     1392              // are we currently in a word? 
     1393              if (in_word) { 
     1394                in_word = false; 
     1395                String word = new String(content_characters, word_start, (i - word_start)); 
     1396                if (case_insensitive) { 
     1397                  word = word.toLowerCase(); 
     1398                } 
     1399                if (query_term_variants.contains(word)) { 
     1400                  // We have found a matching word, so remember its location 
     1401                  word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 
     1402                  // should preceding word matched be set to true/false here?? 
     1403                  preceding_word_matched = true; 
     1404                } else { 
     1405                  preceding_word_matched = false; 
     1406                } 
     1407              } 
    15081408                inTag = true; 
    15091409                continue; 
     
    15121412            { 
    15131413                inTag = false; 
     1414                continue; 
    15141415            } 
    15151416            else if (inTag) 
     
    15341435                // Check if the word matches any of the query term equivalents 
    15351436                String word = new String(content_characters, word_start, (i - word_start)); 
    1536                 //logger.error("word: "+word); 
     1437                if (case_insensitive) { 
     1438                  word = word.toLowerCase(); 
     1439                } 
    15371440                if (query_term_variants.contains(word)) 
    15381441                { 
    1539                   //logger.error("matched"); 
    15401442                    // We have found a matching word, so remember its location 
    15411443                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 
     
    15541456            // Check if the word matches any of the query term equivalents 
    15551457            String word = new String(content_characters, word_start, (content_characters.length - word_start)); 
     1458            if (case_insensitive) { 
     1459              word = word.toLowerCase(); 
     1460            } 
    15561461            if (query_term_variants.contains(word)) 
    15571462            { 
     
    15611466        } 
    15621467 
     1468        if (word_matches.size() == 0) { 
     1469          // just return a copy of the original element 
     1470          return (Element)doc.importNode(original_element, true); 
     1471           
     1472        } 
     1473         
    15631474        ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>(); 
    15641475        ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>(); 
    1565  
     1476         
     1477        if (phrase_query_term_variants_hierarchy.size() ==0) { 
     1478          for (int i = 0; i < word_matches.size(); i++) { 
     1479            highlight_start_positions.add(new Integer(word_matches.get(i).start_position)); 
     1480            highlight_end_positions.add(new Integer(word_matches.get(i).end_position)); 
     1481          } 
     1482        } 
     1483        else { 
    15661484        // Deal with phrases now 
    15671485        ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>(); 
     
    16381556            } 
    16391557        } 
     1558        } 
    16401559 
    16411560        // Now add the annotation tags into the document at the correct points 
    1642         //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM); 
    16431561        Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children. 
    16441562        int last_wrote = 0; 
     
    16751593    } 
    16761594   
    1677     private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) 
    1678     { 
    1679         // Convert the content string to an array of characters for speed 
    1680         char[] content_characters = new char[content.length()]; 
    1681         content.getChars(0, content.length(), content_characters, 0); 
    1682  
    1683         // Now skim through the content, identifying word matches 
    1684         ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>(); 
    1685         int word_start = 0; 
    1686         boolean in_word = false; 
    1687         boolean preceding_word_matched = false; 
    1688         boolean inTag = false; 
    1689         for (int i = 0; i < content_characters.length; i++) 
    1690         { 
    1691             //We don't want to find words inside HTML tags 
    1692             if (content_characters[i] == '<') 
    1693             { 
    1694                 inTag = true; 
    1695                 continue; 
    1696             } 
    1697             else if (inTag && content_characters[i] == '>') 
    1698             { 
    1699                 inTag = false; 
    1700             } 
    1701             else if (inTag) 
    1702             { 
    1703                 continue; 
    1704             } 
    1705  
    1706             boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]); 
    1707  
    1708             // Has a word just started? 
    1709             if (in_word == false && is_character_letter_or_digit == true) 
    1710             { 
    1711                 in_word = true; 
    1712                 word_start = i; 
    1713             } 
    1714  
    1715             // Or has a word just finished? 
    1716             else if (in_word == true && is_character_letter_or_digit == false) 
    1717             { 
    1718                 in_word = false; 
    1719  
    1720                 // Check if the word matches any of the query term equivalents 
    1721                 String word = new String(content_characters, word_start, (i - word_start)); 
    1722                 if (query_term_variants.contains(word)) 
    1723                 { 
    1724                     // We have found a matching word, so remember its location 
    1725                     word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched)); 
    1726                     preceding_word_matched = true; 
    1727                 } 
    1728                 else 
    1729                 { 
    1730                     preceding_word_matched = false; 
    1731                 } 
    1732             } 
    1733         } 
    1734  
    1735         // Don't forget the last word... 
    1736         if (in_word == true) 
    1737         { 
    1738             // Check if the word matches any of the query term equivalents 
    1739             String word = new String(content_characters, word_start, (content_characters.length - word_start)); 
    1740             if (query_term_variants.contains(word)) 
    1741             { 
    1742                 // We have found a matching word, so remember its location 
    1743                 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched)); 
    1744             } 
    1745         } 
    1746  
    1747         ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>(); 
    1748         ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>(); 
    1749  
    1750         // Deal with phrases now 
    1751         ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>(); 
    1752         for (int i = 0; i < word_matches.size(); i++) 
    1753         { 
    1754             WordMatch word_match = word_matches.get(i); 
    1755  
    1756             // See if any partial phrase matches are extended by this word 
    1757             if (word_match.preceding_word_matched) 
    1758             { 
    1759                 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) 
    1760                 { 
    1761                     PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j); 
    1762                     ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number); 
    1763                     HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched); 
    1764                     if (phrase_query_p_term_x_variants.contains(word_match.word)) 
    1765                     { 
    1766                         partial_phrase_match.num_words_matched++; 
    1767  
    1768                         // Has a complete phrase match occurred? 
    1769                         if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) 
    1770                         { 
    1771                             // Check for overlaps by looking at the previous highlight range 
    1772                             if (!highlight_end_positions.isEmpty()) 
    1773                             { 
    1774                                 int last_highlight_index = highlight_end_positions.size() - 1; 
    1775                                 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue(); 
    1776                                 if (last_highlight_end > partial_phrase_match.start_position) 
    1777                                 { 
    1778                                     // There is an overlap, so remove the previous phrase match 
    1779                                     int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue(); 
    1780                                     highlight_end_positions.remove(last_highlight_index); 
    1781                                     partial_phrase_match.start_position = last_highlight_start; 
    1782                                 } 
    1783                             } 
    1784  
    1785                             highlight_start_positions.add(new Integer(partial_phrase_match.start_position)); 
    1786                             highlight_end_positions.add(new Integer(word_match.end_position)); 
    1787                         } 
    1788                         // No, but add the partial match back into the list for next time 
    1789                         else 
    1790                         { 
    1791                             partial_phrase_matches.add(partial_phrase_match); 
    1792                         } 
    1793                     } 
    1794                 } 
    1795             } 
    1796             else 
    1797             { 
    1798                 partial_phrase_matches.clear(); 
    1799             } 
    1800  
    1801             // See if this word is at the start of any of the phrases 
    1802             for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) 
    1803             { 
    1804                 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p); 
    1805                 if (phrase_query_p_term_variants_list.size()>0) { 
    1806                 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0); 
    1807                 if (phrase_query_p_term_1_variants.contains(word_match.word)) 
    1808                 { 
    1809                     // If this phrase is just one word long, we have a complete match 
    1810                     if (phrase_query_p_term_variants_list.size() == 1) 
    1811                     { 
    1812                         highlight_start_positions.add(new Integer(word_match.start_position)); 
    1813                         highlight_end_positions.add(new Integer(word_match.end_position)); 
    1814                     } 
    1815                     // Otherwise we have the start of a potential phrase match 
    1816                     else 
    1817                     { 
    1818                         partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p)); 
    1819                     } 
    1820                 } 
    1821                 } 
    1822             } 
    1823         } 
    1824  
    1825         // Now add the annotation tags into the document at the correct points 
    1826         Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM); 
    1827  
    1828         int last_wrote = 0; 
    1829         for (int i = 0; i < highlight_start_positions.size(); i++) 
    1830         { 
    1831             int highlight_start = highlight_start_positions.get(i).intValue(); 
    1832             int highlight_end = highlight_end_positions.get(i).intValue(); 
    1833  
    1834             // Print anything before the highlight range 
    1835             if (last_wrote < highlight_start) 
    1836             { 
    1837                 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote)); 
    1838                 content_element.appendChild(doc.createTextNode(preceding_text)); 
    1839             } 
    1840  
    1841             // Print the highlight text, annotated 
    1842             if (highlight_end > last_wrote) 
    1843             { 
    1844                 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start)); 
    1845                 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text); 
    1846                 annotation_element.setAttribute("type", "query_term"); 
    1847                 content_element.appendChild(annotation_element); 
    1848                 last_wrote = highlight_end; 
    1849             } 
    1850         } 
    1851  
    1852         // Finish off any unwritten text 
    1853         if (last_wrote < content_characters.length) 
    1854         { 
    1855             String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote)); 
    1856             content_element.appendChild(doc.createTextNode(remaining_text)); 
    1857         } 
    1858         return content_element; 
    1859     } 
    18601595 
    18611596    static private class WordMatch