Changeset 12975


Ignore:
Timestamp:
2006-10-03T14:17:12+13:00 (18 years ago)
Author:
mdewsnip
Message:

Simplified all the term frequency code considerably, and made it actually do what we want: return the term frequencies over all the documents in the index (consistent with MG and MGPP), not just over the documents that match the search.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java

    r12846 r12975  
    2020import org.apache.lucene.index.IndexReader;
    2121import org.apache.lucene.index.Term;
    22 import org.apache.lucene.index.TermFreqVector;
     22import org.apache.lucene.index.TermDocs;
    2323import org.apache.lucene.queryParser.ParseException;
    2424import org.apache.lucene.queryParser.QueryParser;
     
    5757        QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    5858
    59         Sort sorter = new Sort();
     59        Sort sorter = null;
    6060        Filter filter = null;
    6161        String fuzziness = null;
     
    7575        if (args[i].equals("-filter")) {
    7676            i++;
    77            
    78             // Parse up filter
    7977            filter = parseFilterString(args[i]);
    8078        }
     
    127125            query = query.rewrite(reader);
    128126
    129             // Perform the query
    130             Hits hits;
    131             if (filter != null) {
    132             hits = searcher.search(query, filter, sorter);
    133             }
    134             else {
    135             hits = searcher.search(query, sorter);
    136             }
    137 
    138             // Return the list of expanded query terms and their frequencies
    139             HashMap term_counts = new HashMap();
    140             HashMap term_fields = new HashMap();
    141             HashSet terms = new HashSet();
    142             query.extractTerms(terms);
    143             Iterator iter = terms.iterator();
    144             while (iter.hasNext()) {
    145             Term term = (Term) iter.next();
    146             // If you wanted to limit this to just text terms add
    147             // something like this:
    148             //if (term.field().equals(TEXTFIELD))
    149             term_counts.put(term.text(), new Integer(0));
    150             term_fields.put(term.text(), term.field());
    151             }
     127            // Perform the query (filter and sorter may be null)
     128            Hits hits = searcher.search(query, filter, sorter);
    152129
    153130            // Do we need to use a hit iterator to get sorted results?
     
    173150            }
    174151            // And skip all the rest
    175            
    176             // From the document, extract the Term Vector for the
    177             // text field
    178             TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD);
    179             if (term_freq_vector != null && term_freq_vector.size() > 0) {
    180                 int[] term_frequencies = term_freq_vector.getTermFrequencies();
    181                 // Now for each query term, determine the
    182                 // frequency - which may of course be 0.
    183                 Set term_counts_set = term_counts.keySet();
    184                 Iterator terms_iter = term_counts_set.iterator();
    185                 while (terms_iter.hasNext()) {
    186                
    187                 String term = (String) terms_iter.next();
    188                 Integer count_integer = (Integer) term_counts.get(term);
    189                 int count = count_integer.intValue();
    190                 int index = term_freq_vector.indexOf(term);
    191                 // If the term has a count, then add to
    192                 // the total count for this term
    193                 if (index != -1) {
    194                     count += term_frequencies[index];
    195                 }
    196                 // Store the result
    197                 term_counts.put(term, new Integer(count));
    198                 count_integer = null;
    199                 term = null;
    200                 }
    201                 terms_iter = null;
    202                 term_counts_set = null;
     152
     153            ++counter;
     154            }
     155
     156            // Return the list of expanded query terms and their frequencies
     157            HashSet terms = new HashSet();
     158            query.extractTerms(terms);
     159            Iterator term_iterator = terms.iterator();
     160            System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
     161            while (term_iterator.hasNext()) {
     162            Term term = (Term) term_iterator.next();
     163
     164            // Get the term frequency over all the documents
     165            TermDocs term_docs = reader.termDocs(term);
     166            int term_freq = term_docs.freq();
     167            while (term_docs.next()) {
     168                term_freq += term_docs.freq();
    203169            }
    204             else {
    205                 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
    206             }
    207             ++counter;
     170
     171            // If you wanted to limit this to just text terms add
     172            // something like this:
     173            // if (term.field().equals(TEXTFIELD))
     174            System.out.println("  <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
    208175            }
    209176           
    210             // Retrieve all the useful terms
    211             Set term_counts_set = term_counts.keySet();
    212             System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
    213             // Iterate over them
    214             Iterator terms_iter = term_counts_set.iterator();
    215             while (terms_iter.hasNext()) {
    216             String term = (String) terms_iter.next();
    217             Integer count = (Integer) term_counts.get(term);
    218             String field = (String) term_fields.get(term);
    219 
    220             // Ignore any terms with zero frequency, because they don't exist in the matching
    221             // documents. It seems that this should never happen, but it's a consequence of
    222             // how the terms are identified. The terms are found by rewriting the query (above).
    223             // At this point, the query hasn't been run, so each query term is expanded without
    224             // knowing whether the expanded term will actually appear in one of the resulting
    225             // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
    226             // the search is for "otago AND auckland", no matching documents may include "otaio".
    227             // Hopefully that made some sense...
    228             if (count.intValue() > 0) {
    229                 System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
    230             }
    231             count = null;
    232             term = null;
    233             }
    234 
    235             // Cleanup
    236             terms_iter = null;
    237             term_counts_set = null;
    238 
    239177            // Return the list of stop words removed from the query
    240178            HashSet terms_including_stop_words = new HashSet();
  • trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r12846 r12975  
    2020import org.apache.lucene.index.IndexReader;
    2121import org.apache.lucene.index.Term;
    22 import org.apache.lucene.index.TermFreqVector;
     22import org.apache.lucene.index.TermDocs;
    2323import org.apache.lucene.queryParser.ParseException;
    2424import org.apache.lucene.queryParser.QueryParser;
     
    5757        QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    5858
    59         Sort sorter = new Sort();
     59        Sort sorter = null;
    6060        Filter filter = null;
    6161        String fuzziness = null;
     
    7575        if (args[i].equals("-filter")) {
    7676            i++;
    77            
    78             // Parse up filter
    7977            filter = parseFilterString(args[i]);
    8078        }
     
    127125            query = query.rewrite(reader);
    128126
    129             // Perform the query
    130             Hits hits;
    131             if (filter != null) {
    132             hits = searcher.search(query, filter, sorter);
    133             }
    134             else {
    135             hits = searcher.search(query, sorter);
    136             }
    137 
    138             // Return the list of expanded query terms and their frequencies
    139             HashMap term_counts = new HashMap();
    140             HashMap term_fields = new HashMap();
    141             HashSet terms = new HashSet();
    142             query.extractTerms(terms);
    143             Iterator iter = terms.iterator();
    144             while (iter.hasNext()) {
    145             Term term = (Term) iter.next();
    146             // If you wanted to limit this to just text terms add
    147             // something like this:
    148             //if (term.field().equals(TEXTFIELD))
    149             term_counts.put(term.text(), new Integer(0));
    150             term_fields.put(term.text(), term.field());
    151             }
     127            // Perform the query (filter and sorter may be null)
     128            Hits hits = searcher.search(query, filter, sorter);
    152129
    153130            // Do we need to use a hit iterator to get sorted results?
     
    173150            }
    174151            // And skip all the rest
    175            
    176             // From the document, extract the Term Vector for the
    177             // text field
    178             TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD);
    179             if (term_freq_vector != null && term_freq_vector.size() > 0) {
    180                 int[] term_frequencies = term_freq_vector.getTermFrequencies();
    181                 // Now for each query term, determine the
    182                 // frequency - which may of course be 0.
    183                 Set term_counts_set = term_counts.keySet();
    184                 Iterator terms_iter = term_counts_set.iterator();
    185                 while (terms_iter.hasNext()) {
    186                
    187                 String term = (String) terms_iter.next();
    188                 Integer count_integer = (Integer) term_counts.get(term);
    189                 int count = count_integer.intValue();
    190                 int index = term_freq_vector.indexOf(term);
    191                 // If the term has a count, then add to
    192                 // the total count for this term
    193                 if (index != -1) {
    194                     count += term_frequencies[index];
    195                 }
    196                 // Store the result
    197                 term_counts.put(term, new Integer(count));
    198                 count_integer = null;
    199                 term = null;
    200                 }
    201                 terms_iter = null;
    202                 term_counts_set = null;
     152
     153            ++counter;
     154            }
     155
     156            // Return the list of expanded query terms and their frequencies
     157            HashSet terms = new HashSet();
     158            query.extractTerms(terms);
     159            Iterator term_iterator = terms.iterator();
     160            System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
     161            while (term_iterator.hasNext()) {
     162            Term term = (Term) term_iterator.next();
     163
     164            // Get the term frequency over all the documents
     165            TermDocs term_docs = reader.termDocs(term);
     166            int term_freq = term_docs.freq();
     167            while (term_docs.next()) {
     168                term_freq += term_docs.freq();
    203169            }
    204             else {
    205                 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
    206             }
    207             ++counter;
     170
     171            // If you wanted to limit this to just text terms add
     172            // something like this:
     173            // if (term.field().equals(TEXTFIELD))
     174            System.out.println("  <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
    208175            }
    209176           
    210             // Retrieve all the useful terms
    211             Set term_counts_set = term_counts.keySet();
    212             System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
    213             // Iterate over them
    214             Iterator terms_iter = term_counts_set.iterator();
    215             while (terms_iter.hasNext()) {
    216             String term = (String) terms_iter.next();
    217             Integer count = (Integer) term_counts.get(term);
    218             String field = (String) term_fields.get(term);
    219 
    220             // Ignore any terms with zero frequency, because they don't exist in the matching
    221             // documents. It seems that this should never happen, but it's a consequence of
    222             // how the terms are identified. The terms are found by rewriting the query (above).
    223             // At this point, the query hasn't been run, so each query term is expanded without
    224             // knowing whether the expanded term will actually appear in one of the resulting
    225             // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
    226             // the search is for "otago AND auckland", no matching documents may include "otaio".
    227             // Hopefully that made some sense...
    228             if (count.intValue() > 0) {
    229                 System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
    230             }
    231             count = null;
    232             term = null;
    233             }
    234 
    235             // Cleanup
    236             terms_iter = null;
    237             term_counts_set = null;
    238 
    239177            // Return the list of stop words removed from the query
    240178            HashSet terms_including_stop_words = new HashSet();
Note: See TracChangeset for help on using the changeset viewer.