Changeset 8731


Ignore:
Timestamp:
2004-12-03T13:09:47+13:00 (19 years ago)
Author:
mdewsnip
Message:

Much more advanced query term highlighting. Supports query term highlighting and query phrase highlighting, with all permutations of case folding and stemming.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl3/src/java/org/greenstone/gsdl3/action/DocumentAction.java

    r8714 r8731  
    3131
    3232// General Java classes
     33import java.util.ArrayList;
    3334import java.util.HashMap;
    3435import java.util.HashSet;
     
    4849     * type services (enrich) will be offered to the user as well */
    4950    protected static final boolean provide_annotations = false; //true;
     51
    5052
    5153    public Element process (Element message)
     
    540542   
    541543    String path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.TERM_ELEM+GSXML.LIST_MODIFIER);
    542     Element query_term_info_list = (Element) GSXML.getNodeByPath(mr_query_response, path);
    543     if (query_term_info_list == null) {
     544    Element query_term_list_element = (Element) GSXML.getNodeByPath(mr_query_response, path);
     545    if (query_term_list_element == null) {
    544546        // no term info
    545547        System.err.println("DocumentAction: Warning: No query term information.\n");
     
    547549    }
    548550
    549     NodeList equivs = query_term_info_list.getElementsByTagName("equivTermList");
    550     HashSet all_terms = new HashSet();
    551     for (int i=0; i<equivs.getLength(); i++) {
    552        
    553         // get the terms
    554         String [] terms = GSXML.getAttributeValuesFromList((Element)equivs.item(i), GSXML.NAME_ATT);
    555         for (int j=0; j<terms.length; j++) {
    556        
    557         all_terms.add(terms[j]);
    558         }
    559     }
    560 
    561     Element new_content_elem = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
    562 
    563551    String content = GSXML.getNodeText(dc_response_doc_content);
    564552
    565     StringBuffer temp = new StringBuffer();
    566     StringBuffer temp_content = new StringBuffer();
    567 
    568     for (int i=0; i<content.length(); i++) {
    569         char c = content.charAt(i);
    570         if (Character.isLetterOrDigit(c)) {
    571         // not word boundary
    572         temp.append(c);
    573         } else {
    574         // word boundary
    575         // add the last word if there was one
    576         if (temp.length()>0) {
    577             if (all_terms.contains(temp.toString())) {
    578             //if there is anything already present in temp_content, add it as a text node
    579             Text t = this.doc.createTextNode(temp_content.toString());
    580             new_content_elem.appendChild(t);
    581             temp_content.delete(0, temp_content.length());
    582             Element annot = GSXML.createTextElement(this.doc, "annotation", temp.toString());
    583             annot.setAttribute("type", "query_term");
    584             new_content_elem.appendChild(annot);
    585             //new_content.append("<annotation type='query_term'>"+temp+"</annotation>");
    586             } else {
    587             temp_content.append(temp);
     553    String metadata_path = GSPath.appendLink(GSXML.RESPONSE_ELEM, GSXML.METADATA_ELEM+GSXML.LIST_MODIFIER);
     554    Element metadata_list = (Element) GSXML.getNodeByPath(mr_query_response, metadata_path);
     555
     556    HashSet query_term_variants = new HashSet();
     557    NodeList equivalent_terms_nodelist = query_term_list_element.getElementsByTagName("equivTermList");
     558    for (int i = 0; i < equivalent_terms_nodelist.getLength(); i++) {
     559        Element equivalent_terms_element = (Element) equivalent_terms_nodelist.item(i);
     560        String[] equivalent_terms = GSXML.getAttributeValuesFromList(equivalent_terms_element, GSXML.NAME_ATT);
     561        for (int j = 0; j < equivalent_terms.length; j++) {
     562        System.err.println("Adding query term variant: " + equivalent_terms[j]);
     563        query_term_variants.add(equivalent_terms[j]);
     564        }
     565    }
     566
     567    ArrayList phrase_query_term_variants_hierarchy = new ArrayList();
     568
     569    Element query_element = GSXML.getNamedElement(metadata_list, GSXML.METADATA_ELEM, GSXML.NAME_ATT, "query");
     570    String performed_query = GSXML.getNodeText(query_element) + " ";
     571
     572    ArrayList phrase_query_p_term_variants_list = new ArrayList();
     573    int term_start = 0;
     574    boolean in_term = false;
     575    boolean in_phrase = false;
     576    for (int i = 0; i < performed_query.length(); i++) {
     577        char character = performed_query.charAt(i);
     578        boolean is_character_letter_or_digit = Character.isLetterOrDigit(character);
     579
     580        // Has a query term just started?
     581        if (in_term == false && is_character_letter_or_digit == true) {
     582        in_term = true;
     583        term_start = i;
     584        }
     585
     586        // Or has a term just finished?
     587        else if (in_term == true && is_character_letter_or_digit == false) {
     588        in_term = false;
     589        String term = performed_query.substring(term_start, i);
     590        System.err.println("Term: " + term);
     591
     592        HashSet phrase_query_p_term_x_variants = new HashSet();
     593        Element term_element = GSXML.getNamedElement(query_term_list_element, GSXML.TERM_ELEM, GSXML.NAME_ATT, term);
     594        NodeList term_equivalent_terms_nodelist = term_element.getElementsByTagName("equivTermList");
     595        for (int j = 0; j < term_equivalent_terms_nodelist.getLength(); j++) {
     596            Element term_equivalent_terms_element = (Element) term_equivalent_terms_nodelist.item(j);
     597            String[] term_equivalent_terms = GSXML.getAttributeValuesFromList(term_equivalent_terms_element, GSXML.NAME_ATT);
     598            for (int k = 0; k < term_equivalent_terms.length; k++) {
     599            System.err.println("Adding query term variant: " + term_equivalent_terms[k]);
     600            phrase_query_p_term_x_variants.add(term_equivalent_terms[k]);
    588601            }
    589             temp.delete(0, temp.length());
    590         }
    591         if (c=='<') {
    592             temp_content.append(c);
    593             i++;
    594             // skip over html
    595             while (i<content.length() && content.charAt(i)!='>') {
    596             temp_content.append(content.charAt(i));
    597             i++;
     602        }
     603        phrase_query_p_term_variants_list.add(phrase_query_p_term_x_variants);
     604
     605        if (in_phrase == false) {
     606            phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     607            phrase_query_p_term_variants_list = new ArrayList();
     608        }
     609        }
     610
     611        // Watch for phrases (surrounded by quotes)
     612        if (character == '\"') {
     613        // Has a phrase just started?
     614        if (in_phrase == false) {
     615            in_phrase = true;
     616        }
     617        // Or has a phrase just finished?
     618        else if (in_phrase == true) {
     619            in_phrase = false;
     620            phrase_query_term_variants_hierarchy.add(phrase_query_p_term_variants_list);
     621        }
     622
     623        phrase_query_p_term_variants_list = new ArrayList();
     624        }
     625    }
     626
     627    return highlightQueryTermsInternal(content, query_term_variants, phrase_query_term_variants_hierarchy);
     628    }
     629
     630
     631    /**
     632     * Highlights query terms in a piece of text.
     633     */
     634    private Element highlightQueryTermsInternal(String content, HashSet query_term_variants, ArrayList phrase_query_term_variants_hierarchy)
     635    {
     636    // Convert the content string to an array of characters for speed
     637    char[] content_characters = new char[content.length()];
     638    content.getChars(0, content.length(), content_characters, 0);
     639
     640    // Now skim through the content, identifying word matches
     641    ArrayList word_matches = new ArrayList();
     642    int word_start = 0;
     643    boolean in_word = false;
     644    boolean preceding_word_matched = false;
     645    for (int i = 0; i < content_characters.length; i++) {
     646        boolean is_character_letter_or_digit = Character.isLetterOrDigit(content_characters[i]);
     647
     648        // Has a word just started?
     649        if (in_word == false && is_character_letter_or_digit == true) {
     650        in_word = true;
     651        word_start = i;
     652        }
     653
     654        // Or has a word just finished?
     655        else if (in_word == true && is_character_letter_or_digit == false) {
     656        in_word = false;
     657
     658        // Check if the word matches any of the query term equivalents
     659        String word = new String(content_characters, word_start, (i - word_start));
     660        if (query_term_variants.contains(word)) {
     661            // We have found a matching word, so remember its location
     662            word_matches.add(new WordMatch(word, word_start, i, preceding_word_matched));
     663            preceding_word_matched = true;
     664        }
     665        else {
     666            preceding_word_matched = false;
     667        }
     668        }
     669    }
     670
     671    // Don't forget the last word...
     672    if (in_word == true) {
     673        // Check if the word matches any of the query term equivalents
     674        String word = new String(content_characters, word_start, (content_characters.length - word_start));
     675        if (query_term_variants.contains(word)) {
     676        // We have found a matching word, so remember its location
     677        word_matches.add(new WordMatch(word, word_start, content_characters.length, preceding_word_matched));
     678        }
     679    }
     680
     681    ArrayList highlight_start_positions = new ArrayList();
     682    ArrayList highlight_end_positions = new ArrayList();
     683
     684    // Deal with phrases now
     685    ArrayList partial_phrase_matches = new ArrayList();
     686    for (int i = 0; i < word_matches.size(); i++) {
     687        WordMatch word_match = (WordMatch) word_matches.get(i);
     688
     689        // See if any partial phrase matches are extended by this word
     690        if (word_match.preceding_word_matched) {
     691        for (int j = partial_phrase_matches.size() - 1; j >= 0; j--) {
     692            PartialPhraseMatch partial_phrase_match = (PartialPhraseMatch) partial_phrase_matches.remove(j);
     693            ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(partial_phrase_match.query_phrase_number);
     694            HashSet phrase_query_p_term_x_variants = (HashSet) phrase_query_p_term_variants_list.get(partial_phrase_match.num_words_matched);
     695            if (phrase_query_p_term_x_variants.contains(word_match.word)) {
     696            partial_phrase_match.num_words_matched++;
     697
     698            // Has a complete phrase match occurred?
     699            if (partial_phrase_match.num_words_matched == phrase_query_p_term_variants_list.size()) {
     700                // Check for overlaps by looking at the previous highlight range
     701                if (!highlight_end_positions.isEmpty()) {
     702                int last_highlight_index = highlight_end_positions.size() - 1;
     703                int last_highlight_end = ((Integer) highlight_end_positions.get(last_highlight_index)).intValue();
     704                if (last_highlight_end > partial_phrase_match.start_position) {
     705                    // There is an overlap, so remove the previous phrase match
     706                    int last_highlight_start = ((Integer) highlight_start_positions.remove(last_highlight_index)).intValue();
     707                    highlight_end_positions.remove(last_highlight_index);
     708                    partial_phrase_match.start_position = last_highlight_start;
     709                }
     710                }
     711
     712                highlight_start_positions.add(new Integer(partial_phrase_match.start_position));
     713                highlight_end_positions.add(new Integer(word_match.end_position));
     714            }
     715            // No, but add the partial match back into the list for next time
     716            else {
     717                partial_phrase_matches.add(partial_phrase_match);
     718            }
     719            }
     720        }
     721        }
     722        else {
     723        partial_phrase_matches.clear();
     724        }
     725
     726        // See if this word is at the start of any of the phrases
     727        for (int p = 0; p < phrase_query_term_variants_hierarchy.size(); p++) {
     728        ArrayList phrase_query_p_term_variants_list = (ArrayList) phrase_query_term_variants_hierarchy.get(p);
     729        HashSet phrase_query_p_term_1_variants = (HashSet) phrase_query_p_term_variants_list.get(0);
     730        if (phrase_query_p_term_1_variants.contains(word_match.word)) {
     731            // If this phrase is just one word long, we have a complete match
     732            if (phrase_query_p_term_variants_list.size() == 1) {
     733            highlight_start_positions.add(new Integer(word_match.start_position));
     734            highlight_end_positions.add(new Integer(word_match.end_position));
    598735            }
    599             temp_content.append(content.charAt(i));
    600             //temp_content.append(GSXML.xmlSafe(temp.toString()));
    601             //temp.delete(0, temp.length());
    602            
    603         } else {
    604             temp_content.append(c);
    605         }
    606         }
    607     }
    608     // append anything left of temp_content and temp
    609     Text t = this.doc.createTextNode(temp_content.toString());
    610     new_content_elem.appendChild(t);
    611 
    612     if (temp.length() > 0) {
    613         Element annot = GSXML.createTextElement(this.doc, "annotation", temp.toString());
    614         annot.setAttribute("type", "query_term");
    615         new_content_elem.appendChild(annot);
    616     }
    617     //String content_string = "<nodeContent>"+new_content.toString()+"</nodeContent>";
    618     //Element content_elem = this.converter.getDOM(content_string).getDocumentElement();
    619     return new_content_elem;
     736            // Otherwise we have the start of a potential phrase match
     737            else {
     738            partial_phrase_matches.add(new PartialPhraseMatch(word_match.start_position, p));
     739            }
     740        }
     741        }
     742    }
     743
     744    // Now add the annotation tags into the document at the correct points
     745    Element content_element = this.doc.createElement(GSXML.NODE_CONTENT_ELEM);
     746
     747    int last_wrote = 0;
     748    for (int i = 0; i < highlight_start_positions.size(); i++) {
     749        int highlight_start = ((Integer) highlight_start_positions.get(i)).intValue();
     750        int highlight_end = ((Integer) highlight_end_positions.get(i)).intValue();
     751
     752        // Print anything before the highlight range
     753        if (last_wrote < highlight_start) {
     754        String preceding_text = new String(content_characters, last_wrote, (highlight_start - last_wrote));
     755        // System.err.print(preceding_text);
     756        content_element.appendChild(this.doc.createTextNode(preceding_text));
     757        }
     758
     759        // Print the highlight text, annotated
     760        if (highlight_end > last_wrote) {
     761        String highlight_text = new String(content_characters, highlight_start, (highlight_end - highlight_start));
     762        // System.err.print("|" + highlight_text + "|");
     763        Element annotation_element = GSXML.createTextElement(this.doc, "annotation", highlight_text);
     764        annotation_element.setAttribute("type", "query_term");
     765        content_element.appendChild(annotation_element);
     766        last_wrote = highlight_end;
     767        }
     768    }
     769
     770    // Finish off any unwritten text
     771    if (last_wrote < content_characters.length) {
     772        String remaining_text = new String(content_characters, last_wrote, (content_characters.length - last_wrote));
     773        // System.err.print(remaining_text);
     774        content_element.appendChild(this.doc.createTextNode(remaining_text));
     775    }
     776
     777    return content_element;
     778    }
     779
     780
     781    static private class WordMatch
     782    {
     783    public String word;
     784    public int start_position;
     785    public int end_position;
     786    public boolean preceding_word_matched;
     787
     788    public WordMatch(String word, int start_position, int end_position, boolean preceding_word_matched)
     789    {
     790        this.word = word;
     791        this.start_position = start_position;
     792        this.end_position = end_position;
     793        this.preceding_word_matched = preceding_word_matched;
     794    }
     795    }
     796
     797
     798    static private class PartialPhraseMatch
     799    {
     800    public int start_position;
     801    public int query_phrase_number;
     802    public int num_words_matched;
     803
     804    public PartialPhraseMatch(int start_position, int query_phrase_number)
     805    {
     806        this.start_position = start_position;
     807        this.query_phrase_number = query_phrase_number;
     808        this.num_words_matched = 1;
     809    }
    620810    }
    621811}
Note: See TracChangeset for help on using the changeset viewer.