Changeset 32505


Ignore:
Timestamp:
2018-10-09T16:02:10+13:00 (6 years ago)
Author:
kjdon
Message:

Moved some code into a new method, getFormattedArchiveDoc. Modified the search term highlighting code: getting the query term variants is now separated from marking up the text, so the query is only redone once. The text mark-up step can now be called on metadata too - useful if the document page displays a table of metadata and we want to highlight search terms in the table.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

    r32448 r32505  
    8585
    8686        Element message = GSXML.nodeToElement(message_node);
    87         Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
     87        Document doc = XMLConverter.newDOM();
    8888       
    8989        // the response
     
    9898
    9999        // just in case there are some that need to get passed to the services
     100        // why do we use s0 here and s1 in other places???
    100101        HashMap service_params = (HashMap) params.get("s0");
    101102
     
    167168        // are we editing mode? just get the archive document, convert to our internal doc format, and return it
    168169        if (editing_document) {
    169 
    170           // call get archive doc
    171           Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
    172           String to = "DocXMLGetSection";
    173           Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
    174           dx_message.appendChild(dx_request);
    175           Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
    176           dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
    177           dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
    178           dx_request.appendChild(dx_section);
    179 
    180           Element dx_response_message = (Element) this.mr.process(dx_message);
    181           if (processErrorElements(dx_response_message, page_response))
    182             {
    183               return result;
    184             }
    185 
    186           // get the section out
    187           String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
    188           Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
    189           if (section == null) {
    190             logger.error("no archive doc returned for "+document_id);
    191             return result;
    192           }
    193           // convert the archive format into the internal format that the page response requires
    194 
    195           // work out doctype
    196           // NOTE: this will be coming from collection database in index
    197           // the archive file doesn't store this. So we have to assume
    198           // that the doc type will not be changing with any
    199           // modifications happening to archives.
    200          
    201           // if doc type is null, then we need to work it out.
    202           // create a basic doc list containing the current node
    203 
    204           if (document_type == null) {
    205             Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
    206             Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
    207             basic_doc_list.appendChild(current_doc);
    208             current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
    209             basic_doc_list.appendChild(current_doc);
    210             document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
    211           }
    212          
    213           if (document_type == null) {
    214               logger.debug("@@@ doctype is null, setting to simple");
    215               document_type = GSXML.DOC_TYPE_SIMPLE;
    216           }       
    217 
    218           Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);     
    219           doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
    220           page_response.appendChild(doc_elem);
    221 
    222           Element transformed_section = transformArchiveToDocument(section);
    223           if (document_type ==  GSXML.DOC_TYPE_SIMPLE) {
    224             // simple doc, only returning a single document node, which is the top level section.
    225             doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
    226             GSXML.mergeElements(doc_elem, transformed_section);
    227             return result;
    228           }
    229 
    230           // multi sectioned document.
    231           transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);       
    232           // In docEdit mode, we obtain the text from archives, from doc.xml
    233           // Now the transformation has replaced <Section> with <documentNode>
    234           // Need to add nodeID, nodeType and docType attributes to each docNode
    235           // as doc.xml doesn't store that.
    236           insertDocNodeAttributes(transformed_section, document_type, null);         
    237           doc_elem.appendChild(doc.importNode(transformed_section, true));
    238           logger.debug("dx result = "+XMLConverter.getPrettyString(result));
    239 
    240           return result;
     170          return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
    241171        }
    242172       
     
    321251       
    322252        the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
     253
     254        // start getting doc structure
    323255       
    324256        // Create a parameter list to specify the required structure information
     
    482414        }
    483415
     416        // end getting doc structure
     417
     418        // start getting doc metadata
     419       
    484420        // Build a request to obtain some document metadata
    485421        Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
     
    611547        }
    612548
     549       
     550        HashSet<String> query_term_variants = null;
     551        ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
     552        boolean do_highlight_query_terms = highlight_query_terms;
     553        if (highlight_query_terms) {
     554          // lets get the query term equivalents
     555          query_term_variants = new HashSet<String>();
     556          phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
     557          if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) {
     558            do_highlight_query_terms = false; // we couldn't get the terms
     559          }
     560        }
     561
     562        // lets try marking up the metadata with search terms
     563        if (do_highlight_query_terms) {
     564          highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);
     565        }
     566
    613567        // Build a request to obtain some document content
    614568        Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
     
    657611                if (content != null)
    658612                {
    659                     if (highlight_query_terms)
     613                    if (do_highlight_query_terms)
    660614                    {
    661                      
    662                       content = highlightQueryTerms(request, node_id, (Element) content);
     615                      content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy);
    663616                    }
    664617                   
     
    708661                return result;
    709662            }
    710             if (highlight_query_terms)
     663            if (do_highlight_query_terms)
    711664            {
    712665                dc_response_doc.removeChild(dc_response_doc_content);
    713666
    714                 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
     667                dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy);
    715668                dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
    716669            }
     
    803756    }
    804757
     758  protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
     759    // call get archive doc
     760    Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
     761    String to = "DocXMLGetSection";
     762    Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
     763    dx_message.appendChild(dx_request);
     764    Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
     765    dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
     766    dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
     767    dx_request.appendChild(dx_section);
     768
     769    Element dx_response_message = (Element) this.mr.process(dx_message);
     770    if (processErrorElements(dx_response_message, page_response))
     771      {
     772    return result;
     773      }
     774   
     775    // get the section out
     776    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
     777    Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
     778    if (section == null) {
     779      logger.error("no archive doc returned for "+document_id);
     780      return result;
     781    }
     782    // convert the archive format into the internal format that the page response requires
     783   
     784    // work out doctype
     785    // NOTE: this will be coming from collection database in index
     786    // the archive file doesn't store this. So we have to assume
     787    // that the doc type will not be changing with any
     788    // modifications happening to archives.
     789   
     790    // if doc type is null, then we need to work it out.
     791    // create a basic doc list containing the current node
     792   
     793    if (document_type == null) {
     794      Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
     795      Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
     796      basic_doc_list.appendChild(current_doc);
     797      current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
     798      basic_doc_list.appendChild(current_doc);
     799      document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
     800    }
     801   
     802    if (document_type == null) {
     803      logger.debug("@@@ doctype is null, setting to simple");
     804      document_type = GSXML.DOC_TYPE_SIMPLE;
     805    }         
     806   
     807    Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);   
     808    doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
     809    page_response.appendChild(doc_elem);
     810   
     811    Element transformed_section = transformArchiveToDocument(section);
     812    if (document_type ==  GSXML.DOC_TYPE_SIMPLE) {
     813      // simple doc, only returning a single document node, which is the top level section.
     814      doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
     815      GSXML.mergeElements(doc_elem, transformed_section);
     816      return result;
     817    }
     818   
     819    // multi sectioned document.
     820    transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);         
     821    // In docEdit mode, we obtain the text from archives, from doc.xml
     822    // Now the transformation has replaced <Section> with <documentNode>
     823    // Need to add nodeID, nodeType and docType attributes to each docNode
     824    // as doc.xml doesn't store that.
     825    insertDocNodeAttributes(transformed_section, document_type, null);       
     826    doc_elem.appendChild(doc.importNode(transformed_section, true));
     827    logger.debug("dx result = "+XMLConverter.getPrettyString(result));
     828   
     829    return result;
     830  }
     831 
    805832   
    806833    private boolean needSectionContent(HashMap<String, Serializable> params) {
     
    10091036  }
    10101037
    1011 
    1012     /**
    1013      * this involves a bit of a hack to get the equivalent query terms - has to
    1014      * requery the query service - uses the last selected service name. (if it
    1015      * ends in query). should this action do the query or should it send a
    1016      * message to the query action? but that will involve lots of extra stuff.
    1017      * also doesn't handle phrases properly - just highlights all the terms
    1018      * found in the text.
    1019      */
    1020   protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
     1038  /**
     1039   * this involves a bit of a hack to get the equivalent query terms - has to
     1040   * requery the query service - uses the last selected service name. (if it
     1041   * ends in query). should this action do the query or should it send a
     1042   * message to the query action? but that will involve lots of extra stuff.
     1043   */
     1044  protected boolean getQueryTermVariants(Element request, String  current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
     1045  {
     1046    Document doc = request.getOwnerDocument();
     1047   
     1048    // do the query again to get term info
     1049    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     1050    //logger.error("cgi param list = "+XMLConverter.getPrettyString(cgi_param_list));
     1051    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
     1052   
     1053    HashMap previous_params = (HashMap) params.get("p");
     1054    if (previous_params == null)
     1055      {
     1056    //logger.error("no p parms");
     1057    return false;
     1058      }
     1059    String service_name = (String) previous_params.get(GSParams.SERVICE);
     1060    if (service_name == null || !service_name.endsWith("Query"))
     1061      { // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
     1062    logger.debug("invalid service, not doing highlighting");
     1063    return false;
     1064      }
     1065
     1066    String collection = (String) params.get(GSParams.COLLECTION);
     1067    UserContext userContext = new UserContext(request);
     1068    String to = GSPath.appendLink(collection, service_name);
     1069   
     1070    Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
     1071    Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
     1072    mr_query_message.appendChild(mr_query_request);
     1073   
     1074    // paramList
     1075    HashMap service_params = (HashMap) params.get("s1");
     1076   
     1077    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
     1078    GSXML.addParametersToList(query_param_list, service_params);
     1079    // is this only used for solr??? - do we still want it for solr??
     1080    // if (current_node_id != null) {
     1081    //   GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
     1082    // } else {
     1083    //   GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
     1084    // }
     1085    mr_query_request.appendChild(query_param_list);
     1086    // do the query
     1087
     1088    Element mr_query_response = (Element) this.mr.process(mr_query_message);
     1089
     1090    // find the term lists
     1091    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
     1092    Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
     1093    if (query_term_list_element == null)
     1094      {
     1095    // no term info
     1096    logger.error("No query term information. xx\n");
     1097    return false;
     1098      }
     1099    //    logger.error("query term list info "+XMLConverter.getPrettyString(query_term_list_element));
     1100    //String content = GSXML.getNodeText(dc_response_doc_content);
     1101   
     1102    String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
     1103    Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
     1104   
     1105    NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
     1106    if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
     1107      {
     1108    NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
     1109    if (terms_nodelist != null && terms_nodelist.getLength() > 0)
     1110      {
     1111        for (int i = 0; i < terms_nodelist.getLength(); i++)
     1112          {
     1113        String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
     1114        String termValueU = null;
     1115        String termValueL = null;
     1116       
     1117        if (termValue.length() > 1)
     1118          {
     1119            termValueU = termValue.substring(0, 1).toUpperCase() + termValue.substring(1);
     1120            termValueL = termValue.substring(0, 1).toLowerCase() + termValue.substring(1);
     1121          }
     1122        else
     1123          {
     1124            termValueU = termValue.substring(0, 1).toUpperCase();
     1125            termValueL = termValue.substring(0, 1).toLowerCase();
     1126          }
     1127        query_term_variants.add(termValueU);
     1128        query_term_variants.add(termValueL);
     1129          }
     1130      }
     1131      }
     1132    else
     1133      {
     1134    for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++)
     1135      {
     1136        Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
     1137        String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
     1138        for (int j = 0; j < equivalent_terms.length; j++)
     1139          {
     1140        query_term_variants.add(equivalent_terms[j]);
     1141          }
     1142      }
     1143      }
     1144   
     1145
     1146    Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
     1147    String performed_query = GSXML.getNodeText(query_element) + " ";
     1148
     1149    ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
     1150    int term_start = 0;
     1151    boolean in_term = false;
     1152    boolean in_phrase = false;
     1153    for (int i = 0; i < performed_query.length(); i++)
     1154      {
     1155    char character = performed_query.charAt(i);
     1156    boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
     1157   
     1158    // Has a query term just started?
     1159    if (in_term == false && is_character_letter_or_digit == true)
     1160      {
     1161        in_term = true;
     1162        term_start = i;
     1163      }
     1164   
     1165    // Or has a term just finished?
     1166    else if (in_term == true && is_character_letter_or_digit == false)
     1167      {
     1168        in_term = false;
     1169        String term = performed_query.substring(term_start, i);
     1170       
     1171        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
     1172        if (term_element != null)
     1173          {
     1174       
     1175        HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
     1176       
     1177        NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
     1178        if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
     1179          {
     1180            String termValueU = null;
     1181            String termValueL = null;
     1182           
     1183            if (term.length() > 1)
     1184              {
     1185            termValueU = term.substring(0, 1).toUpperCase() + term.substring(1);
     1186            termValueL = term.substring(0, 1).toLowerCase() + term.substring(1);
     1187              }
     1188            else
     1189              {
     1190            termValueU = term.substring(0, 1).toUpperCase();
     1191            termValueL = term.substring(0, 1).toLowerCase();
     1192              }
     1193           
     1194            phrase_query_p_term_x_variants.add(termValueU);
     1195            phrase_query_p_term_x_variants.add(termValueL);
     1196          }
     1197        else
     1198          {
     1199            for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++)
     1200              {
     1201            Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
     1202            String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
     1203            for (int k = 0; k < term_equivalent_terms.length; k++)
     1204              {
     1205                phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
     1206              }
     1207              }
     1208          }
     1209        phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
     1210       
     1211        if (in_phrase == false)
     1212          {
     1213            phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     1214            phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
     1215          }
     1216          }
     1217      }
     1218    // Watch for phrases (surrounded by quotes)
     1219    if (character == '\"')
     1220      {
     1221        // Has a phrase just started?
     1222        if (in_phrase == false)
     1223          {
     1224        in_phrase = true;
     1225          }
     1226        // Or has a phrase just finished?
     1227        else if (in_phrase == true)
     1228          {
     1229        in_phrase = false;
     1230        phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     1231          }
     1232       
     1233        phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
     1234      }
     1235      }
     1236 
     1237    return true;
     1238  }
     1239
     1240  /** redo the request to get the query terms then highlight them in the text
     1241   *
     1242   */
     1243  protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content)
    10211244    {
    10221245        Document doc = request.getOwnerDocument();
     
    10641287        {
    10651288            // Build a request to process highlighted text
    1066            
     1289          logger.error("highlighted node is not null!!!!");
     1290          logger.error(XMLConverter.getPrettyString(highlighted_Node));
    10671291            Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
    10681292            to = GSPath.appendLink(collection, "DocumentContentRetrieve");
     
    10941318        {
    10951319            // no term info
    1096             logger.error("No query term information.\n");
     1320            logger.error("No query term information. yy\n");
    10971321            return dc_response_doc_content;
    10981322        }
     
    12381462        }
    12391463
    1240         return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
     1464        return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
    12411465    }
    12421466
     1467  /**
     1468   * Highlights query terms in specified elements (whose name is in element_names) text inside top_level_elem
     1469   */
     1470  protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
     1471
     1472    //logger.error("begin highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
     1473      NodeList named_elems = top_level_elem.getElementsByTagName(element_name);
     1474      for (int j=named_elems.getLength()-1; j>=0; j--) {
     1475    Element this_elem = (Element)named_elems.item(j);
     1476    Element replacement_elem = highlightQueryTermsElementText(doc, this_elem, query_term_variants, phrase_query_term_variants_hierarchy);
     1477    this_elem.getParentNode().replaceChild(replacement_elem, this_elem);
     1478      }
     1479   
     1480
     1481      //logger.error("end highlight DOM "+XMLConverter.getPrettyString(top_level_elem));
     1482    return true;
     1483  }
    12431484    /**
    1244      * Highlights query terms in a piece of text.
     1485     * Highlights query terms in the text content of an element.
    12451486     */
    1246     private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
     1487  private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/ HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
    12471488    {
     1489      //logger.error("in hl internal, query terms are "+query_term_variants.toString());
     1490      String content = GSXML.getNodeText(original_element);
     1491      //logger.error("original elem = "+XMLConverter.getPrettyString(original_element));
     1492      logger.error("highlighting content: "+content);
    12481493        // Convert the content string to an array of characters for speed
    12491494        char[] content_characters = new char[content.length()];
     
    12891534                // Check if the word matches any of the query term equivalents
    12901535                String word = new String(content_characters, word_start, (i - word_start));
     1536                //logger.error("word: "+word);
    12911537                if (query_term_variants.contains(word))
    12921538                {
     1539                  //logger.error("matched");
    12931540                    // We have found a matching word, so remember its location
    12941541                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     
    13931640
    13941641        // Now add the annotation tags into the document at the correct points
     1642        //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
     1643        Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
     1644        int last_wrote = 0;
     1645        for (int i = 0; i < highlight_start_positions.size(); i++)
     1646        {
     1647            int highlight_start = highlight_start_positions.get(i).intValue();
     1648            int highlight_end = highlight_end_positions.get(i).intValue();
     1649
     1650            // Print anything before the highlight range
     1651            if (last_wrote < highlight_start)
     1652            {
     1653                String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
     1654                content_element.appendChild(doc.createTextNode(preceding_text));
     1655            }
     1656
     1657            // Print the highlight text, annotated
     1658            if (highlight_end > last_wrote)
     1659            {
     1660                String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
     1661                Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
     1662                annotation_element.setAttribute("type", "query_term");
     1663                content_element.appendChild(annotation_element);
     1664                last_wrote = highlight_end;
     1665            }
     1666        }
     1667
     1668        // Finish off any unwritten text
     1669        if (last_wrote < content_characters.length)
     1670        {
     1671            String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
     1672            content_element.appendChild(doc.createTextNode(remaining_text));
     1673        }
     1674        return content_element;
     1675    }
     1676 
     1677    private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
     1678    {
     1679        // Convert the content string to an array of characters for speed
     1680        char[] content_characters = new char[content.length()];
     1681        content.getChars(0, content.length(), content_characters, 0);
     1682
     1683        // Now skim through the content, identifying word matches
     1684        ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
     1685        int word_start = 0;
     1686        boolean in_word = false;
     1687        boolean preceding_word_matched = false;
     1688        boolean inTag = false;
     1689        for (int i = 0; i < content_characters.length; i++)
     1690        {
     1691            //We don't want to find words inside HTML tags
     1692            if (content_characters[i] == '<')
     1693            {
     1694                inTag = true;
     1695                continue;
     1696            }
     1697            else if (inTag && content_characters[i] == '>')
     1698            {
     1699                inTag = false;
     1700            }
     1701            else if (inTag)
     1702            {
     1703                continue;
     1704            }
     1705
     1706            boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
     1707
     1708            // Has a word just started?
     1709            if (in_word == false && is_character_letter_or_digit == true)
     1710            {
     1711                in_word = true;
     1712                word_start = i;
     1713            }
     1714
     1715            // Or has a word just finished?
     1716            else if (in_word == true && is_character_letter_or_digit == false)
     1717            {
     1718                in_word = false;
     1719
     1720                // Check if the word matches any of the query term equivalents
     1721                String word = new String(content_characters, word_start, (i - word_start));
     1722                if (query_term_variants.contains(word))
     1723                {
     1724                    // We have found a matching word, so remember its location
     1725                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     1726                    preceding_word_matched = true;
     1727                }
     1728                else
     1729                {
     1730                    preceding_word_matched = false;
     1731                }
     1732            }
     1733        }
     1734
     1735        // Don't forget the last word...
     1736        if (in_word == true)
     1737        {
     1738            // Check if the word matches any of the query term equivalents
     1739            String word = new String(content_characters, word_start, (content_characters.length - word_start));
     1740            if (query_term_variants.contains(word))
     1741            {
     1742                // We have found a matching word, so remember its location
     1743                word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
     1744            }
     1745        }
     1746
     1747        ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
     1748        ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
     1749
     1750        // Deal with phrases now
     1751        ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
     1752        for (int i = 0; i < word_matches.size(); i++)
     1753        {
     1754            WordMatch word_match = word_matches.get(i);
     1755
     1756            // See if any partial phrase matches are extended by this word
     1757            if (word_match.preceding_word_matched)
     1758            {
     1759                for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
     1760                {
     1761                    PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
     1762                    ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
     1763                    HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
     1764                    if (phrase_query_p_term_x_variants.contains(word_match.word))
     1765                    {
     1766                        partial_phrase_match.num_words_matched++;
     1767
     1768                        // Has a complete phrase match occurred?
     1769                        if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
     1770                        {
     1771                            // Check for overlaps by looking at the previous highlight range
     1772                            if (!highlight_end_positions.isEmpty())
     1773                            {
     1774                                int last_highlight_index = highlight_end_positions.size() - 1;
     1775                                int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
     1776                                if (last_highlight_end > partial_phrase_match.start_position)
     1777                                {
     1778                                    // There is an overlap, so remove the previous phrase match
     1779                                    int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
     1780                                    highlight_end_positions.remove(last_highlight_index);
     1781                                    partial_phrase_match.start_position = last_highlight_start;
     1782                                }
     1783                            }
     1784
     1785                            highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
     1786                            highlight_end_positions.add(new Integer(word_match.end_position));
     1787                        }
     1788                        // No, but add the partial match back into the list for next time
     1789                        else
     1790                        {
     1791                            partial_phrase_matches.add(partial_phrase_match);
     1792                        }
     1793                    }
     1794                }
     1795            }
     1796            else
     1797            {
     1798                partial_phrase_matches.clear();
     1799            }
     1800
     1801            // See if this word is at the start of any of the phrases
     1802            for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
     1803            {
     1804                ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
     1805                if (phrase_query_p_term_variants_list.size()>0) {
     1806                HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
     1807                if (phrase_query_p_term_1_variants.contains(word_match.word))
     1808                {
     1809                    // If this phrase is just one word long, we have a complete match
     1810                    if (phrase_query_p_term_variants_list.size() == 1)
     1811                    {
     1812                        highlight_start_positions.add(new Integer(word_match.start_position));
     1813                        highlight_end_positions.add(new Integer(word_match.end_position));
     1814                    }
     1815                    // Otherwise we have the start of a potential phrase match
     1816                    else
     1817                    {
     1818                        partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
     1819                    }
     1820                }
     1821                }
     1822            }
     1823        }
     1824
     1825        // Now add the annotation tags into the document at the correct points
    13951826        Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
    13961827
Note: See TracChangeset for help on using the changeset viewer.