Context Navigation

← Previous Changeset
Next Changeset →

Changeset 32505

Timestamp:

2018-10-09T16:02:10+13:00 (6 years ago)

Author:

kjdon

Message:

Moved some code into a new method, getFormattedArchiveDoc. Modified the search term highlighting code: getting the query term variants is now separated from marking up the text, so redoing the query is only done once. The text mark-up step can now be called on metadata too - useful if the document page displays a table of metadata and we want to highlight search terms in the table.

File:

: 1 edited

main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java (modified) (15 diffs)

Legend:

: Unmodified
: Added
: Removed

main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

-              r32448
+              r32505
         Element message = GSXML.nodeToElement(message_node);
         Document doc = XMLConverter.newDOM(); //message.getOwnerDocument();
+        Document doc = XMLConverter.newDOM();
         // the response
 …
         // just in case there are some that need to get passed to the services
+        // why do we use s0 here and s1 in other places???
         HashMap service_params = (HashMap) params.get("s0");
 …
         // are we editing mode? just get the archive document, convert to our internal doc format, and return it
         if (editing_document) {
+          // call get archive doc
+          Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
+          String to = "DocXMLGetSection";
+          Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
+          dx_message.appendChild(dx_request);
+          Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
+          dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
+          dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
+          dx_request.appendChild(dx_section);
+          Element dx_response_message = (Element) this.mr.process(dx_message);
+          if (processErrorElements(dx_response_message, page_response))
+            {
+              return result;
+            }
+          // get the section out
+          String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
+          Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
+          if (section == null) {
+            logger.error("no archive doc returned for "+document_id);
+            return result;
+          }
+          // convert the archive format into the internal format that the page response requires
+          // work out doctype
+          // NOTE: this will be coming from collection database in index
+          // the archive file doesn't store this. So we have to assume
+          // that the doc type will not be changing with any
+          // modifications happening to archives.
+          // if doc type is null, then we need to work it out.
+          // create a basic doc list containing the current node
+          if (document_type == null) {
+            Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
+            Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
+            basic_doc_list.appendChild(current_doc);
+            current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
+            basic_doc_list.appendChild(current_doc);
+            document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
+          }
+          if (document_type == null) {
+              logger.debug("@@@ doctype is null, setting to simple");
+              document_type = GSXML.DOC_TYPE_SIMPLE;
+          }
+          Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
+          doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
+          page_response.appendChild(doc_elem);
+          Element transformed_section = transformArchiveToDocument(section);
+          if (document_type ==  GSXML.DOC_TYPE_SIMPLE) {
+            // simple doc, only returning a single document node, which is the top level section.
+            doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
+            GSXML.mergeElements(doc_elem, transformed_section);
+            return result;
+          }
+          // multi sectioned document.
+          transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
+          // In docEdit mode, we obtain the text from archives, from doc.xml
+          // Now the transformation has replaced <Section> with <documentNode>
+          // Need to add nodeID, nodeType and docType attributes to each docNode
+          // as doc.xml doesn't store that.
+          insertDocNodeAttributes(transformed_section, document_type, null);
+          doc_elem.appendChild(doc.importNode(transformed_section, true));
+          logger.debug("dx result = "+XMLConverter.getPrettyString(result));
+          return result;
+          return getFormattedArchiveDoc(doc, collection, document_id, document_type, result, page_response, userContext);
+        }
 …
         the_document.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
+        // start getting doc structure
         // Create a parameter list to specify the required structure information
 …
+        }
+        // end getting doc structure
+        // start getting doc metadata
         // Build a request to obtain some document metadata
         Element dm_message = doc.createElement(GSXML.MESSAGE_ELEM);
 …
+        }
+        HashSet<String> query_term_variants = null;
+        ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy = null;
+        boolean do_highlight_query_terms = highlight_query_terms;
+        if (highlight_query_terms) {
+          // lets get the query term equivalents
+          query_term_variants = new HashSet<String>();
+          phrase_query_term_variants_hierarchy = new ArrayList<ArrayList<HashSet<String>>>();
+          if (!getQueryTermVariants(request, null, /*current_node_id,*/ query_term_variants, phrase_query_term_variants_hierarchy)) {
+            do_highlight_query_terms = false; // we couldn't get the terms
+          }
+        }
+        // lets try marking up the metadata with search terms
+        if (do_highlight_query_terms) {
+          highlightQueryTermsDOM(doc, the_document, "metadata", query_term_variants, phrase_query_term_variants_hierarchy);
+        }
         // Build a request to obtain some document content
         Element dc_message = doc.createElement(GSXML.MESSAGE_ELEM);
 …
                 if (content != null)
+                {
                     if (highlight_query_terms)
+                    if (do_highlight_query_terms)
+                    {
+                      content = highlightQueryTerms(request, node_id, (Element) content);
+                      content = highlightQueryTermsElementText(doc, (Element)content, query_term_variants, phrase_query_term_variants_hierarchy);
+                    }
 …
                 return result;
+            }
             if (highlight_query_terms)
+            if (do_highlight_query_terms)
+            {
                 dc_response_doc.removeChild(dc_response_doc_content);
                 dc_response_doc_content = highlightQueryTerms(request, null, dc_response_doc_content);
+                dc_response_doc_content = highlightQueryTermsElementText(doc, (Element)dc_response_doc_content, query_term_variants, phrase_query_term_variants_hierarchy);
                 dc_response_doc.appendChild(dc_response_doc.getOwnerDocument().importNode(dc_response_doc_content, true));
+            }
 …
+    }
+  /**
+   * Fetches a section from the archives (doc.xml) through the
+   * DocXMLGetSection service, converts it to the internal document format and
+   * appends the resulting document element to page_response.
+   *
+   * @param doc            DOM document used to create all request/response elements
+   * @param collection     collection name, set on the section request
+   * @param document_id    node id of the section to fetch
+   * @param document_type  doc type if already known; when null it is looked up
+   *                       with getDocumentType and, failing that, defaults to
+   *                       GSXML.DOC_TYPE_SIMPLE
+   * @param result         element that is returned on every path
+   * @param page_response  receives the new document element; also passed to
+   *                       processErrorElements
+   * @param userContext    passed through to the service requests
+   * @return result
+   */
+  protected Element getFormattedArchiveDoc(Document doc, String collection, String document_id, String document_type, Element result, Element page_response, UserContext userContext ) {
+    // call get archive doc
+    Element dx_message = doc.createElement(GSXML.MESSAGE_ELEM);
+    String to = "DocXMLGetSection";
+    Element dx_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
+    dx_message.appendChild(dx_request);
+    Element dx_section = doc.createElement(GSXML.DOCXML_SECTION_ELEM);
+    dx_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
+    dx_section.setAttribute(GSXML.COLLECTION_ATT, collection);
+    dx_request.appendChild(dx_section);
+    Element dx_response_message = (Element) this.mr.process(dx_message);
+    if (processErrorElements(dx_response_message, page_response))
+      {
+        return result;
+      }
+    // get the section out
+    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.DOCXML_SECTION_ELEM);
+    Element section = (Element) GSXML.getNodeByPath(dx_response_message, path);
+    if (section == null) {
+      logger.error("no archive doc returned for "+document_id);
+      return result;
+    }
+    // convert the archive format into the internal format that the page response requires
+    // work out doctype
+    // NOTE: this will be coming from collection database in index
+    // the archive file doesn't store this. So we have to assume
+    // that the doc type will not be changing with any
+    // modifications happening to archives.
+    // if doc type is null, then we need to work it out.
+    // create a basic doc list containing the current node
+    if (document_type == null) {
+      Element basic_doc_list = doc.createElement(GSXML.DOC_NODE_ELEM + GSXML.LIST_MODIFIER);
+      Element current_doc = doc.createElement(GSXML.DOC_NODE_ELEM);
+      current_doc.setAttribute(GSXML.NODE_ID_ATT, document_id);
+      basic_doc_list.appendChild(current_doc);
+      document_type = getDocumentType(basic_doc_list, collection, userContext, page_response);
+    }
+    if (document_type == null) {
+      logger.debug("@@@ doctype is null, setting to simple");
+      document_type = GSXML.DOC_TYPE_SIMPLE;
+    }
+    Element doc_elem = doc.createElement(GSXML.DOCUMENT_ELEM);
+    doc_elem.setAttribute(GSXML.DOC_TYPE_ATT, document_type);
+    page_response.appendChild(doc_elem);
+    Element transformed_section = transformArchiveToDocument(section);
+    // Compare by value: document_type may be a String obtained at runtime
+    // (getDocumentType or the caller), which is not the same object as the constant.
+    if (GSXML.DOC_TYPE_SIMPLE.equals(document_type)) {
+      // simple doc, only returning a single document node, which is the top level section.
+      doc_elem.setAttribute(GSXML.NODE_ID_ATT, document_id);
+      GSXML.mergeElements(doc_elem, transformed_section);
+      return result;
+    }
+    // multi sectioned document.
+    transformed_section.setAttribute(GSXML.NODE_ID_ATT, document_id);
+    // In docEdit mode, we obtain the text from archives, from doc.xml
+    // Now the transformation has replaced <Section> with <documentNode>
+    // Need to add nodeID, nodeType and docType attributes to each docNode
+    // as doc.xml doesn't store that.
+    insertDocNodeAttributes(transformed_section, document_type, null);
+    doc_elem.appendChild(doc.importNode(transformed_section, true));
+    logger.debug("dx result = "+XMLConverter.getPrettyString(result));
+    return result;
+  }
     private boolean needSectionContent(HashMap<String, Serializable> params) {
 …
+  }
+    /**
+     * this involves a bit of a hack to get the equivalent query terms - has to
+     * requery the query service - uses the last selected service name. (if it
+     * ends in query). should this action do the query or should it send a
+     * message to the query action? but that will involve lots of extra stuff.
+     * also doesn't handle phrases properly - just highlights all the terms
+     * found in the text.
+     */
+  protected Element highlightQueryTerms(Element request, String current_node_id, Element dc_response_doc_content)
+  /**
+   * Collects the query term variants needed for search term highlighting.
+   *
+   * this involves a bit of a hack to get the equivalent query terms - has to
+   * requery the query service - uses the last selected service name. (if it
+   * ends in query). should this action do the query or should it send a
+   * message to the query action? but that will involve lots of extra stuff.
+   *
+   * @param request          the original request; its param list supplies the
+   *                         previous ("p") params, the collection and the "s1"
+   *                         service params used to redo the query
+   * @param current_node_id  currently unused (only referenced by the
+   *                         commented-out hldocOID code below)
+   * @param query_term_variants  output: every term variant found is added here
+   * @param phrase_query_term_variants_hierarchy  output: one entry per query
+   *                         phrase; each entry holds one variant set per term.
+   *                         A term outside quotes becomes a one-term entry.
+   * @return false if there are no previous params, the previous service name
+   *         does not end in "Query", or the query response has no term list;
+   *         true otherwise
+   */
+  protected boolean getQueryTermVariants(Element request, String  current_node_id, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
+  {
+    Document doc = request.getOwnerDocument();
+
+    // do the query again to get term info
+    Element cgi_param_list = (Element) GSXML.getChildByTagName(request, GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
+    HashMap<String, Serializable> params = GSXML.extractParams(cgi_param_list, false);
+
+    HashMap previous_params = (HashMap) params.get("p");
+    if (previous_params == null)
+    {
+      return false;
+    }
+    String service_name = (String) previous_params.get(GSParams.SERVICE);
+    if (service_name == null || !service_name.endsWith("Query"))
+    {
+      // hack for now - we only do highlighting if we were in a query last - ie not if we were in a browse thingy
+      logger.debug("invalid service, not doing highlighting");
+      return false;
+    }
+    String collection = (String) params.get(GSParams.COLLECTION);
+    UserContext userContext = new UserContext(request);
+    String to = GSPath.appendLink(collection, service_name);
+
+    Element mr_query_message = doc.createElement(GSXML.MESSAGE_ELEM);
+    Element mr_query_request = GSXML.createBasicRequest(doc, GSXML.REQUEST_TYPE_PROCESS, to, userContext);
+    mr_query_message.appendChild(mr_query_request);
+
+    // paramList
+    HashMap service_params = (HashMap) params.get("s1");
+    Element query_param_list = doc.createElement(GSXML.PARAM_ELEM + GSXML.LIST_MODIFIER);
+    GSXML.addParametersToList(query_param_list, service_params);
+    // is this only used for solr??? - do we still want it for solr??
+    // if (current_node_id != null) {
+    //   GSXML.addParameterToList(query_param_list, "hldocOID", current_node_id);
+    // } else {
+    //   GSXML.addParameterToList(query_param_list, "hldocOID", (String) params.get(GSParams.DOCUMENT));
+    // }
+    mr_query_request.appendChild(query_param_list);
+
+    // do the query
+    Element mr_query_response = (Element) this.mr.process(mr_query_message);
+
+    // find the term lists
+    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM + GSXML.LIST_MODIFIER);
+    Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
+    if (query_term_list_element == null)
+    {
+      // no term info
+      logger.error("No query term information. xx\n");
+      return false;
+    }
+
+    String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM + GSXML.LIST_MODIFIER);
+    Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
+
+    // Flat set of variants: use the equivalent term lists if the service
+    // returned any, otherwise fall back to first-letter case variants of each term.
+    NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
+    if (equivalent_terms_nodelist == null || equivalent_terms_nodelist.getLength() == 0)
+    {
+      NodeList terms_nodelist = query_term_list_element.getElementsByTagName("term");
+      if (terms_nodelist != null)
+      {
+        for (int i = 0; i < terms_nodelist.getLength(); i++)
+        {
+          String termValue = ((Element) terms_nodelist.item(i)).getAttribute("name");
+          addFirstLetterCaseVariants(termValue, query_term_variants);
+        }
+      }
+    }
+    else
+    {
+      addEquivalentTerms(equivalent_terms_nodelist, query_term_variants);
+    }
+
+    // The metadata list is looked up by path just like the term list, which is
+    // null-checked above, so guard it as well before searching it.
+    Element query_element = null;
+    if (metadata_list != null)
+    {
+      query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
+    }
+    // The trailing space makes sure the last term in the query is terminated.
+    String performed_query = GSXML.getNodeText(query_element) + " ";
+
+    // Walk the performed query character by character, splitting it into
+    // terms (runs of letters/digits) and grouping terms inside quotes into phrases.
+    ArrayList<HashSet<String>> phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
+    int term_start = 0;
+    boolean in_term = false;
+    boolean in_phrase = false;
+    for (int i = 0; i < performed_query.length(); i++)
+    {
+      char character = performed_query.charAt(i);
+      boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
+
+      // Has a query term just started?
+      if (!in_term && is_character_letter_or_digit)
+      {
+        in_term = true;
+        term_start = i;
+      }
+      // Or has a term just finished?
+      else if (in_term && !is_character_letter_or_digit)
+      {
+        in_term = false;
+        String term = performed_query.substring(term_start, i);
+        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
+        if (term_element != null)
+        {
+          HashSet<String> phrase_query_p_term_x_variants = new HashSet<String>();
+          NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
+          if (term_equivalent_terms_nodelist == null || term_equivalent_terms_nodelist.getLength() == 0)
+          {
+            addFirstLetterCaseVariants(term, phrase_query_p_term_x_variants);
+          }
+          else
+          {
+            addEquivalentTerms(term_equivalent_terms_nodelist, phrase_query_p_term_x_variants);
+          }
+          phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
+
+          // A term outside quotes is a complete one-term "phrase" by itself
+          if (!in_phrase)
+          {
+            phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
+            phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
+          }
+        }
+      }
+
+      // Watch for phrases (surrounded by quotes)
+      if (character == '\"')
+      {
+        if (in_phrase)
+        {
+          // A phrase has just finished
+          in_phrase = false;
+          phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
+        }
+        else
+        {
+          // A phrase has just started
+          in_phrase = true;
+        }
+        // Either way, start collecting into a fresh list
+        phrase_query_p_term_variants_list = new ArrayList<HashSet<String>>();
+      }
+    }
+    return true;
+  }
+
+  /**
+   * Adds term with its first character upper-cased, and term with its first
+   * character lower-cased, to variants. A null or empty term adds nothing
+   * (previously an empty term caused a StringIndexOutOfBoundsException).
+   */
+  private static void addFirstLetterCaseVariants(String term, HashSet<String> variants)
+  {
+    if (term == null || term.length() == 0)
+    {
+      return;
+    }
+    String first = term.substring(0, 1);
+    String rest = term.substring(1);
+    variants.add(first.toUpperCase() + rest);
+    variants.add(first.toLowerCase() + rest);
+  }
+
+  /**
+   * Adds the GSXML.NAME_ATT values found in each equivTermList element of
+   * equiv_term_lists to variants.
+   */
+  private static void addEquivalentTerms(NodeList equiv_term_lists, HashSet<String> variants)
+  {
+    for (int i = 0; i < equiv_term_lists.getLength(); i++)
+    {
+      Element equiv_term_list = (Element) equiv_term_lists.item(i);
+      String[] equivalent_terms = GSXML.getAttributeValuesFromList(equiv_term_list, GSXML.NAME_ATT);
+      for (int j = 0; j < equivalent_terms.length; j++)
+      {
+        variants.add(equivalent_terms[j]);
+      }
+    }
+  }
+  /** redo the request to get the query terms then highlight them in the text
+   *
+   */
+  protected Element highlightQueryTermsOld(Element request, String current_node_id, Element dc_response_doc_content)
+    {
         Document doc = request.getOwnerDocument();
 …
+        {
             // Build a request to process highlighted text
+          logger.error("highlighted node is not null!!!!");
+          logger.error(XMLConverter.getPrettyString(highlighted_Node));
             Element hl_message = doc.createElement(GSXML.MESSAGE_ELEM);
             to = GSPath.appendLink(collection, "DocumentContentRetrieve");
 …
+        {
             // no term info
             logger.error("No query term information.\n");
+            logger.error("No query term information. yy\n");
             return dc_response_doc_content;
+        }
 …
+        }
         return highlightQueryTermsInternal(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
+        return highlightQueryTermsInternalOrig(doc, content, query_term_variants, phrase_query_term_variants_hierarchy);
+    }
+  /**
+   * Replaces every descendant of top_level_elem whose tag name is
+   * element_name with the element that highlightQueryTermsElementText
+   * returns for it. Matches are visited from last to first.
+   *
+   * @return always true
+   */
+  protected boolean highlightQueryTermsDOM(Document doc, Element top_level_elem, String element_name, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy) {
+    NodeList candidates = top_level_elem.getElementsByTagName(element_name);
+    int remaining = candidates.getLength();
+    while (remaining > 0) {
+      remaining--;
+      Element target = (Element) candidates.item(remaining);
+      Element marked_up = highlightQueryTermsElementText(doc, target, query_term_variants, phrase_query_term_variants_hierarchy);
+      // swap the marked-up copy into the tree in place of the original
+      target.getParentNode().replaceChild(marked_up, target);
+    }
+    return true;
+  }
     /**
      * Highlights query terms in a piece of text.
+     * Highlights query terms in the text content of an element.
      */
     private Element highlightQueryTermsInternal(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
+  private Element highlightQueryTermsElementText(Document doc, Element original_element, /*String content,*/  HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
+    {
+      //logger.error("in hl internal, query terms are "+query_term_variants.toString());
+      String content = GSXML.getNodeText(original_element);
+      //logger.error("original elem = "+XMLConverter.getPrettyString(original_element));
+      logger.error("highlighting content: "+content);
         // Convert the content string to an array of characters for speed
         char[] content_characters = new char[content.length()];
 …
                 // Check if the word matches any of the query term equivalents
                 String word = new String(content_characters, word_start, (i - word_start));
+                //logger.error("word: "+word);
                 if (query_term_variants.contains(word))
+                {
+                  //logger.error("matched");
                     // We have found a matching word, so remember its location
                     word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
 …
         // Now add the annotation tags into the document at the correct points
+        //Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);
+        Element content_element = (Element)doc.importNode(original_element, false); // just copy the element plus any attributes, but not any children.
+        int last_wrote = 0;
+        for (int i = 0; i < highlight_start_positions.size(); i++)
+        {
+            int highlight_start = highlight_start_positions.get(i).intValue();
+            int highlight_end = highlight_end_positions.get(i).intValue();
+            // Print anything before the highlight range
+            if (last_wrote < highlight_start)
+            {
+                String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
+                content_element.appendChild(doc.createTextNode(preceding_text));
+            }
+            // Print the highlight text, annotated
+            if (highlight_end > last_wrote)
+            {
+                String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
+                Element annotation_element = GSXML.createTextElement(doc, "annotation", highlight_text);
+                annotation_element.setAttribute("type", "query_term");
+                content_element.appendChild(annotation_element);
+                last_wrote = highlight_end;
+            }
+        }
+        // Finish off any unwritten text
+        if (last_wrote < content_characters.length)
+        {
+            String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
+            content_element.appendChild(doc.createTextNode(remaining_text));
+        }
+        return content_element;
+    }
+    private Element highlightQueryTermsInternalOrig(Document doc, String content, HashSet<String> query_term_variants, ArrayList<ArrayList<HashSet<String>>> phrase_query_term_variants_hierarchy)
+    {
+        // Convert the content string to an array of characters for speed
+        char[] content_characters = new char[content.length()];
+        content.getChars(0, content.length(), content_characters, 0);
+        // Now skim through the content, identifying word matches
+        ArrayList<WordMatch> word_matches = new ArrayList<WordMatch>();
+        int word_start = 0;
+        boolean in_word = false;
+        boolean preceding_word_matched = false;
+        boolean inTag = false;
+        for (int i = 0; i < content_characters.length; i++)
+        {
+            //We don't want to find words inside HTML tags
+            if (content_characters[i] == '<')
+            {
+                inTag = true;
+                continue;
+            }
+            else if (inTag && content_characters[i] == '>')
+            {
+                inTag = false;
+            }
+            else if (inTag)
+            {
+                continue;
+            }
+            boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
+            // Has a word just started?
+            if (in_word == false && is_character_letter_or_digit == true)
+            {
+                in_word = true;
+                word_start = i;
+            }
+            // Or has a word just finished?
+            else if (in_word == true && is_character_letter_or_digit == false)
+            {
+                in_word = false;
+                // Check if the word matches any of the query term equivalents
+                String word = new String(content_characters, word_start, (i - word_start));
+                if (query_term_variants.contains(word))
+                {
+                    // We have found a matching word, so remember its location
+                    word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
+                    preceding_word_matched = true;
+                }
+                else
+                {
+                    preceding_word_matched = false;
+                }
+            }
+        }
+        // Don't forget the last word...
+        if (in_word == true)
+        {
+            // Check if the word matches any of the query term equivalents
+            String word = new String(content_characters, word_start, (content_characters.length - word_start));
+            if (query_term_variants.contains(word))
+            {
+                // We have found a matching word, so remember its location
+                word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
+            }
+        }
+        ArrayList<Integer> highlight_start_positions = new ArrayList<Integer>();
+        ArrayList<Integer> highlight_end_positions = new ArrayList<Integer>();
+        // Deal with phrases now
+        ArrayList<PartialPhraseMatch> partial_phrase_matches = new ArrayList<PartialPhraseMatch>();
+        for (int i = 0; i < word_matches.size(); i++)
+        {
+            WordMatch word_match = word_matches.get(i);
+            // See if any partial phrase matches are extended by this word
+            if (word_match.preceding_word_matched)
+            {
+                for (int j = partial_phrase_matches.size() - 1; j >= 0; j--)
+                {
+                    PartialPhraseMatch partial_phrase_match = partial_phrase_matches.remove(j);
+                    ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
+                    HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
+                    if (phrase_query_p_term_x_variants.contains(word_match.word))
+                    {
+                        partial_phrase_match.num_words_matched++;
+                        // Has a complete phrase match occurred?
+                        if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size())
+                        {
+                            // Check for overlaps by looking at the previous highlight range
+                            if (!highlight_end_positions.isEmpty())
+                            {
+                                int last_highlight_index = highlight_end_positions.size() - 1;
+                                int last_highlight_end = highlight_end_positions.get(last_highlight_index).intValue();
+                                if (last_highlight_end > partial_phrase_match.start_position)
+                                {
+                                    // There is an overlap, so remove the previous phrase match
+                                    int last_highlight_start = highlight_start_positions.remove(last_highlight_index).intValue();
+                                    highlight_end_positions.remove(last_highlight_index);
+                                    partial_phrase_match.start_position = last_highlight_start;
+                                }
+                            }
+                            highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
+                            highlight_end_positions.add(new Integer(word_match.end_position));
+                        }
+                        // No, but add the partial match back into the list for next time
+                        else
+                        {
+                            partial_phrase_matches.add(partial_phrase_match);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                partial_phrase_matches.clear();
+            }
+            // See if this word is at the start of any of the phrases
+            for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++)
+            {
+                ArrayList phrase_query_p_term_variants_list = phrase_query_term_variants_hierarchy.get(p);
+                if (phrase_query_p_term_variants_list.size()>0) {
+                HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
+                if (phrase_query_p_term_1_variants.contains(word_match.word))
+                {
+                    // If this phrase is just one word long, we have a complete match
+                    if (phrase_query_p_term_variants_list.size() == 1)
+                    {
+                        highlight_start_positions.add(new Integer(word_match.start_position));
+                        highlight_end_positions.add(new Integer(word_match.end_position));
+                    }
+                    // Otherwise we have the start of a potential phrase match
+                    else
+                    {
+                        partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
+                    }
+                }
+                }
+            }
+        }
+        // Now add the annotation tags into the document at the correct points
         Element content_element = doc.createElement(GSXML.NODE_CONTENT_ELEM);

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 32505

Legend:

main/trunk/greenstone3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

Download in other formats: