Changeset 32545


Ignore:
Timestamp:
2018-10-29T12:19:46+13:00 (5 years ago)
Author:
kjdon
Message:

Changes to query term highlighting. 1. Added back the functionality for getting marked-up text back from the indexer - useful for Solr, as it doesn't give you the equivalent terms list, so you don't know what terms it has matched; e.g. a query for snail will match snail and snails. Only do this for text though, not metadata. 2. If the query doesn't give back equivalent terms (Solr and Lucene), then do case-insensitive matching. Previously it just did upper and lower case for the first letter, but that didn't match e.g. SNAIL. 3. If a term in the performed query is not in the terms list (e.g. in the case of Solr, query snails, term returned will be snail), but it starts with a term in the term list, then add it to the term list. This helps with a Solr query of snails, but not with a query of snail - which matches snails, but snails won't get highlighted. 4. If there is no double quote in the query string, then don't worry about trying to make the phrase structure; just find all the matches of the terms and highlight them. I think that's all...

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

    r32505 r32545  
    3535import java.util.HashMap;
    3636import java.util.HashSet;
     37import java.util.Iterator;
    3738import java.io.File;
    3839import java.io.Serializable;
     
    541542        GSXML.mergeMetadataLists(the_document, top_doc_node);
    542543
     544        // if we are highlighting query terms, then we also get them highlighted in the metadata
     545       
     546        HashSet<String> query_term_variants = null;
     547        ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
     548        boolean do_highlight_query_terms = highlight_query_terms;
     549        int query_terms_status = 0;
     550        if (highlight_query_terms) {
     551          // lets get the query term equivalents
     552          query_term_variants = new HashSet<String>();
     553          phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
     554          if ((query_terms_status = getQueryTermVariants(request, query_term_variants, phrase_query_term_variants_hierarchy)) ==0) {
     555            do_highlight_query_terms = false; // we couldn't get the terms
     556          }
     557        }
     558
     559        // lets try marking up the metadata with search terms
     560        // if the search service doesn't send back <equivTermlist> then we haven't got the term variants. We lower case everything and do case insensitive matching
     561        boolean highlight_case_insensitive = false;
     562        if (query_terms_status == NO_EQUIV_QUERY_TERMS) {
     563          highlight_case_insensitive = true;
     564        }
     565        if (do_highlight_query_terms) {
     566          highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
     567        }
     568
    543569        // do we want doc text content? If not, we are done.
    544570        if (!get_text) {
     
    546572          return result;
    547573        }
    548 
    549574       
    550         HashSet<String> query_term_variants = null;
    551         ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
    552         boolean do_highlight_query_terms = highlight_query_terms;
    553         if (highlight_query_terms) {
    554           // lets get the query term equivalents
    555           query_term_variants = new HashSet<String>();
    556           phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
    557           if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) {
    558             do_highlight_query_terms = false; // we couldn't get the terms
    559           }
    560         }
    561 
    562         // lets try marking up the metadata with search terms
    563         if (do_highlight_query_terms) {
    564           highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);
    565         }
    566 
    567575        // Build a request to obtain some document content
    568576        Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
     
    599607        Element dc_response_doc_list = (Element) GSXML.getNodeByPath(dc_response_message, path);
    600608
     609        boolean get_marked_up_doc_from_query = false;
     610        if (do_highlight_query_terms && query_terms_status == NO_EQUIV_QUERY_TERMS) {
     611          get_marked_up_doc_from_query = true; // we try to. solr we can, lucene we can't
     612        }
     613
    601614        if (expand_document)
    602615        {
     
    606619            {
    607620                String node_id = ((Element)doc_nodes.item(i)).getAttribute(GSXML.NODE_ID_ATT);
    608                 //Node content = GSXML.getChildByTagName((Element) dc_response_docs.item(i), GSXML.NODE_CONTENT_ELEM);
    609621                Node docNode = GSXML.getNamedElement(dc_response_doc_list, "documentNode", GSXML.NODE_ID_ATT, node_id);
    610622                Node content = GSXML.getChildByTagName(docNode, GSXML.NODE_CONTENT_ELEM);
    611623                if (content != null)
    612624                {
    613                     if (do_highlight_query_terms)
    614                     {
    615                       content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy);
    616                     }
    617                    
    618                     doc_nodes.item(i).appendChild(doc.importNode(content, true));
    619                 }
    620                 //GSXML.mergeMetadataLists(doc_nodes.item(i), dm_response_docs.item(i));
     625                  if (do_highlight_query_terms) {
     626                    if (get_marked_up_doc_from_query) {
     627                     
     628                      Element new_content = retrieveHighlightedContent(request, node_id);
     629                     
     630                      if (new_content == null) {
     631                    // we didn't get any text back from the request. assume we won't be able to get it next time either (eg lucene)
     632                    get_marked_up_doc_from_query = false;
     633                    content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
     634                      } else {
     635                    content= new_content;
     636                      }
     637                    } else {
     638                      content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
     639                    }
     640                  }
     641                  doc_nodes.item(i).appendChild(doc.importNode(content, true));
     642                }
     643
    621644            }
    622645            if (has_dummy && document_type.equals(GSXML.DOC_TYPE_SIMPLE)) {
     
    641664        else
    642665        {
    643             //path = GSPath.appendLink(path, GSXML.DOC_NODE_ELEM);
    644666            Element dc_response_doc = (Element) GSXML.getChildByTagName(dc_response_doc_list, GSXML.DOC_NODE_ELEM);
    645667            Element dc_response_doc_content = (Element) GSXML.getChildByTagName(dc_response_doc, GSXML.NODE_CONTENT_ELEM);
    646             //Element dc_response_doc_external = (Element) GSXML.getChildByTagName(dc_response_doc, "external");
    647668
    648669            if (dc_response_doc_content == null)
     
    651672                if (dc_response_doc.getAttribute("external").equals("true"))
    652673                {
    653 
    654                     //if (dc_response_doc_external != null)
    655                     //{
    656674                    String href_id = dc_response_doc.getAttribute(GSXML.HREF_ID_ATT);
    657675
     
    664682            {
    665683                dc_response_doc.removeChild(dc_response_doc_content);
    666 
    667                 dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy);
     684                if (get_marked_up_doc_from_query) {
     685                  Element new_content = retrieveHighlightedContent(request, null);
     686                  if (new_content == null) {
     687                    get_marked_up_doc_from_query = false;
     688                    dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
     689                  } else {
     690                   
     691                    dc_response_doc_content = new_content;
     692                  }
     693                } else {
     694                  dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy, highlight_case_insensitive);
     695                }
    668696                dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
    669697            }
     
    720748
    721749                    NodeList dummy_children = dummy_node.getChildNodes();
    722                     //for (int i=0; i<dummy_children.getLength(); i++) {
    723750                    for (int i = dummy_children.getLength() - 1; i >= 0; i--)
    724751                    {
     
    10361063  }
    10371064
     1065  protected final int NO_QUERY_TERMS = 0;
     1066  protected final int NO_EQUIV_QUERY_TERMS = 1;
     1067  protected final int EQUIV_QUERY_TERMS = 2;
    10381068  /**
    10391069   * this involves a bit of a hack to get the equivalent query terms - has to
    10401070   * requery the query service - uses the last selected service name. (if it
    1041    * ends in query). should this action do the query or should it send a
    1042    * message to the query action? but that will involve lots of extra stuff.
     1071   * ends in query). 
    10431072   */
    1044   protected boolean getQueryTermVariants(Element request, String  current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
     1073  protected int getQueryTermVariants(Element request, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
    10451074  {
    1046     Document doc = request.getOwnerDocument();
    1047    
     1075    Document doc = XMLConverter.newDOM();
     1076
    10481077    // do the query again to get term info
    10491078    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    1050     //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list));
    10511079    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
    10521080   
     
    10541082    if (previous_params == null)
    10551083      {
    1056     //logger.error("no p parms");
    1057     return false;
     1084    return NO_QUERY_TERMS;
    10581085      }
    10591086    String service_name = (String) previous_params.get(GSParams.SERVICE);
    10601087    if (service_name == null || !service_name.endsWith("Query"))
    10611088      { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
    1062     logger.debug("invalid service, not doing highlighting");
    1063     return false;
     1089    logger.debug("invalid service "+service_name+", not doing highlighting");
     1090    return NO_QUERY_TERMS;
    10641091      }
    10651092
     
    10771104    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    10781105    GSXML.addParametersToList(query_param_list, service_params);
    1079     // is this only used for solr??? - do we still want it for solr??
    1080     // if (current_node_id != null) {
    1081     //   GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
    1082     // } else {
    1083     //   GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
    1084     // }
    10851106    mr_query_request.appendChild(query_param_list);
     1107
    10861108    // do the query
    1087 
    10881109    Element mr_query_response = (Element) this.mr.process(mr_query_message);
    1089 
     1110       
    10901111    // find the term lists
    10911112    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
     
    10941115      {
    10951116    // no term info
    1096     logger.error("No query term information. xx\n");
    1097     return false;
     1117    return  NO_QUERY_TERMS;
    10981118      }
    1099     //    logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element));
    1100     //String content = GSXML.getNodeText(dc_response_doc_content);
    1101    
    1102     String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
    1103     Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
    1104    
     1119   
     1120    int result_code = NO_EQUIV_QUERY_TERMS;
    11051121    NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
    11061122    if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
    11071123      {
     1124    // if we have no equivalent terms, just add the current terms lower cased and we do case insensitive matching later on
    11081125    NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
    11091126    if (terms_nodelist != null && terms_nodelist.getLength() > 0)
     
    11121129          {
    11131130        String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
    1114         String termValueU = null;
    1115         String termValueL = null;
    1116        
    1117         if (termValue.length() > 1)
    1118           {
    1119             termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
    1120             termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
    1121           }
    1122         else
    1123           {
    1124             termValueU = termValue.substring(0, 1).toUpperCase();
    1125             termValueL = termValue.substring(0, 1).toLowerCase();
    1126           }
    1127         query_term_variants.add(termValueU);
    1128         query_term_variants.add(termValueL);
     1131        query_term_variants.add(termValue.toLowerCase());
    11291132          }
    11301133      }
     
    11321135    else
    11331136      {
     1137    result_code = EQUIV_QUERY_TERMS;
    11341138    for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
    11351139      {
     
    11431147      }
    11441148   
     1149    String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
     1150    Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
    11451151
    11461152    Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
    11471153    String performed_query = GSXML.getNodeText(query_element) + " ";
    1148 
     1154    logger.debug("performed query="+performed_query);
     1155
     1156    boolean has_phrases = false; // if there are no phrases, we don't bother making the phrase variants structure
     1157    if (performed_query.contains("\"")) {
     1158      has_phrases = true;
     1159    }
     1160   
    11491161    ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
    11501162    int term_start = 0;
    11511163    boolean in_term = false;
    11521164    boolean in_phrase = false;
    1153     for (int i = 0; i < performed_query.length(); i++)
    1154       {
     1165    for (int i = 0; i < performed_query.length(); i++) {
     1166     
    11551167    char character = performed_query.charAt(i);
    11561168    boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
     
    11681180        in_term = false;
    11691181        String term = performed_query.substring(term_start, i);
    1170        
    1171         Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
    1172         if (term_element != null)
    1173           {
     1182        if (has_phrases) {
     1183          // do the phrase bit
     1184          HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
     1185          if (result_code == EQUIV_QUERY_TERMS) {
     1186        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
     1187        if (term_element != null) {
     1188          // might be null for eg TX in [snails]:TX
    11741189       
    1175         HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
    1176        
    1177         NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
    1178         if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
    1179           {
    1180             String termValueU = null;
    1181             String termValueL = null;
    1182            
    1183             if (term.length() > 1)
    1184               {
    1185             termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
    1186             termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
    1187               }
    1188             else
    1189               {
    1190             termValueU = term.substring(0, 1).toUpperCase();
    1191             termValueL = term.substring(0, 1).toLowerCase();
    1192               }
    1193            
    1194             phrase_query_p_term_x_variants.add(termValueU);
    1195             phrase_query_p_term_x_variants.add(termValueL);
    1196           }
    1197         else
    1198           {
     1190          NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
     1191          if (term_equivalent_terms_nodelist != null || term_equivalent_terms_nodelist.getLength() != 0) {
    11991192            for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
    12001193              {
     
    12071200              }
    12081201          }
     1202        }
     1203          } else { // result_code != EQUIV_QUERY_TERMS
     1204        // we don;t have equivalent term list, so just add the lower cased version in, and we do case-insensitive matching later on
     1205        if (query_term_variants.contains(term.toLowerCase()) || containsSubString(query_term_variants, term)) {
     1206          // this handles the case where the user has searched for snails, but term list returns 'snail'
     1207            phrase_query_p_term_x_variants.add(term.toLowerCase());
     1208        }
     1209          }
     1210          if (phrase_query_p_term_x_variants.size()>0) {
     1211        // we have found a valid term
    12091212        phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
    12101213       
     
    12151218          }
    12161219          }
    1217       }
     1220        } // end if has_phrases
     1221        else {
     1222          // no phrases so we don't have to do the phrasey stuff. but
     1223          // we need to check the term against the query term list - if its not in there, check whether its the root of a term.
     1224          // we want to handle the case where user has queried "snails", the term list returned only has snail, and therefore snails doesn't get highlighted.
     1225          // but dont want to include eg TX
     1226          if (result_code == NO_EQUIV_QUERY_TERMS) {
     1227        if (containsSubString(query_term_variants, term)) {
     1228          query_term_variants.add(term.toLowerCase());
     1229        }
     1230          }
     1231         
     1232        }
     1233      } // end of in_term...
    12181234    // Watch for phrases (surrounded by quotes)
    1219     if (character == '\"')
    1220       {
     1235    if (character == '\"') {
     1236     
    12211237        // Has a phrase just started?
    12221238        if (in_phrase == false)
     
    12321248       
    12331249        phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
    1234       }
     1250    } // if char == "
     1251    } // for each char in performed query
     1252 
     1253    return result_code;
     1254  }
     1255
     1256  protected boolean containsSubString(HashSet<String> query_term_variants, String term) {
     1257    // hack to filter out TX, TI field names
     1258    String lc_term = term.toLowerCase();
     1259    if (query_term_variants.contains(term)) {
     1260      return false; // or true??
     1261    }
     1262    if (term.matches("[A-Z][A-Z][A-Z]?")) {
     1263      return false;
     1264    }
     1265    Iterator i = query_term_variants.iterator();
     1266    while (i.hasNext()) {
     1267      String t = (String)i.next();
     1268      if (term.startsWith(t)) {
     1269    return true;
    12351270      }
    1236  
    1237     return true;
     1271    }
     1272    return false;
    12381273  }
    12391274
    1240   /** redo the request to get the query terms then highlight them in the text
    1241    *
    1242    */
    1243   protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content)
    1244     {
    1245         Document doc = request.getOwnerDocument();
    1246        
    1247         // do the query again to get term info
    1248         Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    1249         HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
    1250 
    1251         HashMap previous_params = (HashMap) params.get("p");
    1252         if (previous_params == null)
    1253         {
    1254             return dc_response_doc_content;
    1255         }
    1256         String service_name = (String) previous_params.get(GSParams.SERVICE);
    1257         if (service_name == null || !service_name.endsWith("Query"))
    1258         { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
    1259             logger.debug("invalid service, not doing highlighting");
    1260             return dc_response_doc_content;
    1261         }
    1262         String collection = (String) params.get(GSParams.COLLECTION);
    1263         UserContext userContext = new UserContext(request);
    1264         String to = GSPath.appendLink(collection, service_name);
    1265 
    1266         Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
    1267         Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
    1268         mr_query_message.appendChild(mr_query_request);
    1269 
    1270         // paramList
    1271         HashMap service_params = (HashMap) params.get("s1");
    1272 
    1273         Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    1274         GSXML.addParametersToList(query_param_list, service_params);
    1275         if (current_node_id != null) {
    1276           GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
    1277         } else {
    1278           GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
    1279         }
    1280         mr_query_request.appendChild(query_param_list);
    1281         // do the query
    1282         Element mr_query_response = (Element) this.mr.process(mr_query_message);
    1283         String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
    1284         Element highlighted_Node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
    1285         // For SOLR, the above query may come back with a nodeContent element, which is the hldocOID section content, with search terms marked up. We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
    1286         if (highlighted_Node != null)
    1287         {
    1288             // Build a request to process highlighted text
    1289           logger.error("highlighted node is not null!!!!");
    1290           logger.error(XMLConverter.getPrettyString(highlighted_Node));
    1291             Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
    1292             to = GSPath.appendLink(collection, "DocumentContentRetrieve");
    1293             Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
    1294             hl_message.appendChild(dc_request);
    1295 
    1296             // Create a parameter list to specify the request parameters - empty for now
    1297             Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
    1298             dc_request.appendChild(dc_param_list);
    1299 
    1300             // get the content
    1301             Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
    1302             dc_request.appendChild(doc_list);
    1303             Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
    1304             doc_list.appendChild(current_doc);
    1305             current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
    1306             //Append highlighted content to request for processing
    1307             dc_request.appendChild(doc.importNode(highlighted_Node, true));
    1308             Element hl_response_message = (Element) this.mr.process(hl_message);
    1309        
    1310             //Get results
    1311             NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
    1312             Element content = (Element) contentList.item(0);   
    1313             return content;
    1314         }
    1315         String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
    1316         Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
    1317         if (query_term_list_element == null)
    1318         {
    1319             // no term info
    1320             logger.error("No query term information. yy\n");
    1321             return dc_response_doc_content;
    1322         }
    1323 
    1324         String content = GSXML.getNodeText(dc_response_doc_content);
    1325 
    1326         String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
    1327         Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
    1328 
    1329         HashSet<String> query_term_variants = new HashSet<String>();
    1330         NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
    1331         if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
    1332         {
    1333             NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
    1334             if (terms_nodelist != null && terms_nodelist.getLength() > 0)
    1335             {
    1336                 for (int i = 0; i < terms_nodelist.getLength(); i++)
    1337                 {
    1338                     String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
    1339                     String termValueU = null;
    1340                     String termValueL = null;
    1341 
    1342                     if (termValue.length() > 1)
    1343                     {
    1344                         termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
    1345                         termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
    1346                     }
    1347                     else
    1348                     {
    1349                         termValueU = termValue.substring(0, 1).toUpperCase();
    1350                         termValueL = termValue.substring(0, 1).toLowerCase();
    1351                     }
    1352 
    1353                     query_term_variants.add(termValueU);
    1354                     query_term_variants.add(termValueL);
    1355                 }
    1356             }
    1357         }
    1358         else
    1359         {
    1360             for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
    1361             {
    1362                 Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
    1363                 String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
    1364                 for (int j = 0; j < equivalent_terms.length; j++)
    1365                 {
    1366                     query_term_variants.add(equivalent_terms[j]);
    1367                 }
    1368             }
    1369         }
    1370 
    1371         ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
    1372 
    1373         Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
    1374         String performed_query = GSXML.getNodeText(query_element) + " ";
    1375 
    1376         ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
    1377         int term_start = 0;
    1378         boolean in_term = false;
    1379         boolean in_phrase = false;
    1380         for (int i = 0; i < performed_query.length(); i++)
    1381         {
    1382             char character = performed_query.charAt(i);
    1383             boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
    1384 
    1385             // Has a query term just started?
    1386             if (in_term == false && is_character_letter_or_digit == true)
    1387             {
    1388                 in_term = true;
    1389                 term_start = i;
    1390             }
    1391 
    1392             // Or has a term just finished?
    1393             else if (in_term == true && is_character_letter_or_digit == false)
    1394             {
    1395                 in_term = false;
    1396                 String term = performed_query.substring(term_start, i);
    1397 
    1398                 Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
    1399                 if (term_element != null)
    1400                 {
    1401 
    1402                     HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
    1403 
    1404                     NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
    1405                     if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
    1406                     {
    1407                         String termValueU = null;
    1408                         String termValueL = null;
    1409 
    1410                         if (term.length() > 1)
    1411                         {
    1412                             termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
    1413                             termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
    1414                         }
    1415                         else
    1416                         {
    1417                             termValueU = term.substring(0, 1).toUpperCase();
    1418                             termValueL = term.substring(0, 1).toLowerCase();
    1419                         }
    1420 
    1421                         phrase_query_p_term_x_variants.add(termValueU);
    1422                         phrase_query_p_term_x_variants.add(termValueL);
    1423                     }
    1424                     else
    1425                     {
    1426                         for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
    1427                         {
    1428                             Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
    1429                             String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
    1430                             for (int k = 0; k < term_equivalent_terms.length; k++)
    1431                             {
    1432                                 phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
    1433                             }
    1434                         }
    1435                     }
    1436                     phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
    1437 
    1438                     if (in_phrase == false)
    1439                     {
    1440                         phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
    1441                         phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
    1442                     }
    1443                 }
    1444             }
    1445             // Watch for phrases (surrounded by quotes)
    1446             if (character == '\"')
    1447             {
    1448                 // Has a phrase just started?
    1449                 if (in_phrase == false)
    1450                 {
    1451                     in_phrase = true;
    1452                 }
    1453                 // Or has a phrase just finished?
    1454                 else if (in_phrase == true)
    1455                 {
    1456                     in_phrase = false;
    1457                     phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
    1458                 }
    1459 
    1460                 phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
    1461             }
    1462         }
    1463 
    1464         return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
    1465     }
    1466 
     1275
     1276  /** retrieve the marked up highlighted section - only works for solr collection */
     1277  protected Element retrieveHighlightedContent(Element request, String node_id) {
     1278
     1279    Document doc = XMLConverter.newDOM();
     1280
     1281    // do the query again to get term info
     1282    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     1283    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
     1284   
     1285    HashMap previous_params = (HashMap) params.get("p");
     1286    if (previous_params == null)
     1287      {
     1288    return null;
     1289      }
     1290    String service_name = (String) previous_params.get(GSParams.SERVICE);
     1291    if (service_name == null || !service_name.endsWith("Query"))
     1292      { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
     1293    logger.debug("HL invalid service, not doing highlighting");
     1294    return null;
     1295      }
     1296
     1297    String collection = (String) params.get(GSParams.COLLECTION);
     1298    UserContext userContext = new UserContext(request);
     1299    String to = GSPath.appendLink(collection, service_name);
     1300   
     1301    Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
     1302    Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
     1303    mr_query_message.appendChild(mr_query_request);
     1304   
     1305    // paramList
     1306    HashMap service_params = (HashMap) params.get("s1");
     1307
     1308    // hack in case the user searched on eg titles, but we want highlighting in the text
     1309    service_params.put("index", "TX");
     1310    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     1311    GSXML.addParametersToList(query_param_list, service_params);
     1312
     1313     if (node_id != null) {
     1314       GSXML.addParameterToList(query_param_list, "hldocOID", node_id);
     1315     } else {
     1316       GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
     1317     }
     1318    mr_query_request.appendChild(query_param_list);
     1319    // do the query
     1320
     1321    Element mr_query_response = (Element) this.mr.process(mr_query_message);
     1322    String pathNode = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.NODE_CONTENT_ELEM);
     1323    Element highlighted_node = (Element) GSXML.getNodeByPath(mr_query_response, pathNode);
     1324
     1325     if (highlighted_node == null) {
     1326       return null;
     1327     }
     1328     // For SOLR, the highlighted node will be a nodeContent element, which is the hldocOID section content, with search terms marked up.
     1329     //We send it back to the documnetContentRetrieve service so that resolveTextMacros can be applied, and it can be properly encased in documentNode etc elements
     1330
     1331     // Build a request to process highlighted text
     1332     
     1333     Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
     1334     to = GSPath.appendLink(collection, "DocumentContentRetrieve");
     1335     Element dc_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
     1336     hl_message.appendChild(dc_request);
     1337     
     1338     // Create a parameter list to specify the request parameters - empty for now
     1339     Element dc_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     1340     dc_request.appendChild(dc_param_list);
     1341
     1342     // get the content
     1343     Element doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
     1344     dc_request.appendChild(doc_list);
     1345     Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
     1346     doc_list.appendChild(current_doc);
     1347     current_doc.setAttribute(GSXML.NODE_ID_ATT, (String) params.get(GSParams.DOCUMENT));
     1348     //Append highlighted content to request for processing
     1349     dc_request.appendChild(doc.importNode(highlighted_node, true));
     1350     Element hl_response_message = (Element) this.mr.process(hl_message);
     1351     //Get results
     1352     NodeList contentList = hl_response_message.getElementsByTagName(GSXML.NODE_CONTENT_ELEM);
     1353     Element content = (Element) contentList.item(0);
     1354     return content;
     1355     
     1356   
     1357  }
    14671358  /**
    14681359   * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
    14691360   */
    1470   protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
    1471 
    1472     //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
     1361  protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive) {
     1362
    14731363      NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
    14741364      for (int j=named_elems.getLength()-1; j>=0; j--) {
    14751365    Element this_elem = (Element)named_elems.item(j);
    1476     Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy);
     1366    Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy, case_insensitive);
    14771367    this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
    14781368      }
    1479    
    1480 
    1481       //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
    14821369    return true;
    14831370  }
     
    14851372     * Highlights query terms in the text content of an element.
    14861373     */
    1487   private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/  HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
     1374  private Element highlightQueryTermsElementText(Document doc, Element original_element,  HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy, boolean case_insensitive)
    14881375    {
    1489       //logger.error("in hl internal, query terms are "+query_term_variants.toString());
    14901376      String content = GSXML.getNodeText(original_element);
    1491       //logger.error("original elem = "+XMLConverter.getPrettyString(original_element));
    1492       logger.error("highlighting content: "+content);
    14931377        // Convert the content string to an array of characters for speed
    14941378        char[] content_characters = new char[content.length()];
     
    15061390            if (content_characters[i] == '<')
    15071391            {
     1392              // are we currently in a word?
     1393              if (in_word) {
     1394                in_word = false;
     1395                String word = new String(content_characters, word_start, (i - word_start));
     1396                if (case_insensitive) {
     1397                  word = word.toLowerCase();
     1398                }
     1399                if (query_term_variants.contains(word)) {
     1400                  // We have found a matching word, so remember its location
     1401                  word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     1402                  // should preceding word matched be set to true/false here??
     1403                  preceding_word_matched = true;
     1404                } else {
     1405                  preceding_word_matched = false;
     1406                }
     1407              }
    15081408                inTag = true;
    15091409                continue;
     
    15121412            {
    15131413                inTag = false;
     1414                continue;
    15141415            }
    15151416            else if (inTag)
     
    15341435                // Check if the word matches any of the query term equivalents
    15351436                String word = new String(content_characters, word_start, (i - word_start));
    1536                 //logger.error("word: "+word);
     1437                if (case_insensitive) {
     1438                  word = word.toLowerCase();
     1439                }
    15371440                if (query_term_variants.contains(word))
    15381441                {
    1539                   //logger.error("matched");
    15401442                    // We have found a matching word, so remember its location
    15411443                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     
    15541456            // Check if the word matches any of the query term equivalents
    15551457            String word = new String(content_characters, word_start, (content_characters.length - word_start));
     1458            if (case_insensitive) {
     1459              word = word.toLowerCase();
     1460            }
    15561461            if (query_term_variants.contains(word))
    15571462            {
     
    15611466        }
    15621467
     1468        if (word_matches.size() == 0) {
     1469          // just return a copy of the original element
     1470          return (Element)doc.importNode(original_element, true);
     1471         
     1472        }
     1473       
    15631474        ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
    15641475        ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
    1565 
     1476       
     1477        if (phrase_query_term_variants_hierarchy.size() ==0) {
     1478          for (int i = 0; i < word_matches.size(); i++) {
     1479            highlight_start_positions.add(new Integer(word_matches.get(i).start_position));
     1480            highlight_end_positions.add(new Integer(word_matches.get(i).end_position));
     1481          }
     1482        }
     1483        else {
    15661484        // Deal with phrases now
    15671485        ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
     
    16381556            }
    16391557        }
     1558        }
    16401559
    16411560        // Now add the annotation tags into the document at the correct points
    1642         //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
    16431561        Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
    16441562        int last_wrote = 0;
     
    16751593    }
    16761594 
    1677     private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
    1678     {
    1679         // Convert the content string to an array of characters for speed
    1680         char[] content_characters = new char[content.length()];
    1681         content.getChars(0, content.length(), content_characters, 0);
    1682 
    1683         // Now skim through the content, identifying word matches
    1684         ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
    1685         int word_start = 0;
    1686         boolean in_word = false;
    1687         boolean preceding_word_matched = false;
    1688         boolean inTag = false;
    1689         for (int i = 0; i < content_characters.length; i++)
    1690         {
    1691             //We don't want to find words inside HTML tags
    1692             if (content_characters[i] == '<')
    1693             {
    1694                 inTag = true;
    1695                 continue;
    1696             }
    1697             else if (inTag && content_characters[i] == '>')
    1698             {
    1699                 inTag = false;
    1700             }
    1701             else if (inTag)
    1702             {
    1703                 continue;
    1704             }
    1705 
    1706             boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
    1707 
    1708             // Has a word just started?
    1709             if (in_word == false && is_character_letter_or_digit == true)
    1710             {
    1711                 in_word = true;
    1712                 word_start = i;
    1713             }
    1714 
    1715             // Or has a word just finished?
    1716             else if (in_word == true && is_character_letter_or_digit == false)
    1717             {
    1718                 in_word = false;
    1719 
    1720                 // Check if the word matches any of the query term equivalents
    1721                 String word = new String(content_characters, word_start, (i - word_start));
    1722                 if (query_term_variants.contains(word))
    1723                 {
    1724                     // We have found a matching word, so remember its location
    1725                     word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
    1726                     preceding_word_matched = true;
    1727                 }
    1728                 else
    1729                 {
    1730                     preceding_word_matched = false;
    1731                 }
    1732             }
    1733         }
    1734 
    1735         // Don't forget the last word...
    1736         if (in_word == true)
    1737         {
    1738             // Check if the word matches any of the query term equivalents
    1739             String word = new String(content_characters, word_start, (content_characters.length - word_start));
    1740             if (query_term_variants.contains(word))
    1741             {
    1742                 // We have found a matching word, so remember its location
    1743                 word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
    1744             }
    1745         }
    1746 
    1747         ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
    1748         ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
    1749 
    1750         // Deal with phrases now
    1751         ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
    1752         for (int i = 0; i < word_matches.size(); i++)
    1753         {
    1754             WordMatch word_match = word_matches.get(i);
    1755 
    1756             // See if any partial phrase matches are extended by this word
    1757             if (word_match.preceding_word_matched)
    1758             {
    1759                 for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
    1760                 {
    1761                     PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
    1762                     ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
    1763                     HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
    1764                     if (phrase_query_p_term_x_variants.contains(word_match.word))
    1765                     {
    1766                         partial_phrase_match.num_words_matched++;
    1767 
    1768                         // Has a complete phrase match occurred?
    1769                         if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
    1770                         {
    1771                             // Check for overlaps by looking at the previous highlight range
    1772                             if (!highlight_end_positions.isEmpty())
    1773                             {
    1774                                 int last_highlight_index = highlight_end_positions.size() - 1;
    1775                                 int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
    1776                                 if (last_highlight_end > partial_phrase_match.start_position)
    1777                                 {
    1778                                     // There is an overlap, so remove the previous phrase match
    1779                                     int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
    1780                                     highlight_end_positions.remove(last_highlight_index);
    1781                                     partial_phrase_match.start_position = last_highlight_start;
    1782                                 }
    1783                             }
    1784 
    1785                             highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
    1786                             highlight_end_positions.add(new Integer(word_match.end_position));
    1787                         }
    1788                         // No, but add the partial match back into the list for next time
    1789                         else
    1790                         {
    1791                             partial_phrase_matches.add(partial_phrase_match);
    1792                         }
    1793                     }
    1794                 }
    1795             }
    1796             else
    1797             {
    1798                 partial_phrase_matches.clear();
    1799             }
    1800 
    1801             // See if this word is at the start of any of the phrases
    1802             for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
    1803             {
    1804                 ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
    1805                 if (phrase_query_p_term_variants_list.size()>0) {
    1806                 HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
    1807                 if (phrase_query_p_term_1_variants.contains(word_match.word))
    1808                 {
    1809                     // If this phrase is just one word long, we have a complete match
    1810                     if (phrase_query_p_term_variants_list.size() == 1)
    1811                     {
    1812                         highlight_start_positions.add(new Integer(word_match.start_position));
    1813                         highlight_end_positions.add(new Integer(word_match.end_position));
    1814                     }
    1815                     // Otherwise we have the start of a potential phrase match
    1816                     else
    1817                     {
    1818                         partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
    1819                     }
    1820                 }
    1821                 }
    1822             }
    1823         }
    1824 
    1825         // Now add the annotation tags into the document at the correct points
    1826         Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
    1827 
    1828         int last_wrote = 0;
    1829         for (int i = 0; i < highlight_start_positions.size(); i++)
    1830         {
    1831             int highlight_start = highlight_start_positions.get(i).intValue();
    1832             int highlight_end = highlight_end_positions.get(i).intValue();
    1833 
    1834             // Print anything before the highlight range
    1835             if (last_wrote < highlight_start)
    1836             {
    1837                 String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
    1838                 content_element.appendChild(doc.createTextNode(preceding_text));
    1839             }
    1840 
    1841             // Print the highlight text, annotated
    1842             if (highlight_end > last_wrote)
    1843             {
    1844                 String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
    1845                 Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
    1846                 annotation_element.setAttribute("type", "query_term");
    1847                 content_element.appendChild(annotation_element);
    1848                 last_wrote = highlight_end;
    1849             }
    1850         }
    1851 
    1852         // Finish off any unwritten text
    1853         if (last_wrote < content_characters.length)
    1854         {
    1855             String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
    1856             content_element.appendChild(doc.createTextNode(remaining_text));
    1857         }
    1858         return content_element;
    1859     }
    18601595
    18611596    static private class WordMatch
Note: See TracChangeset for help on using the changeset viewer.