Changeset 12975


Ignore:
Timestamp:
2006-10-03T14:17:12+13:00 (18 years ago)
Author:
mdewsnip
Message:

Simplified all the term frequency code considerably, and made it actually do what we want: return the term frequencies over all the documents in the index (consistent with MG and MGPP), not just over the documents that match the search.

Location:
trunk
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java

    r12846 r12975  
    2020import org.apache.lucene.index.IndexReader;
    2121import org.apache.lucene.index.Term;
    22 import org.apache.lucene.index.TermFreqVector;
     22import org.apache.lucene.index.TermDocs;
    2323import org.apache.lucene.queryParser.ParseException;
    2424import org.apache.lucene.queryParser.QueryParser;
     
    5757        QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    5858
    59         Sort sorter = new Sort();
     59        Sort sorter = null;
    6060        Filter filter = null;
    6161        String fuzziness = null;
     
    7575        if (args[i].equals("-filter")) {
    7676            i++;
    77            
    78             // Parse up filter
    7977            filter = parseFilterString(args[i]);
    8078        }
     
    127125            query = query.rewrite(reader);
    128126
    129             // Perform the query
    130             Hits hits;
    131             if (filter != null) {
    132             hits = searcher.search(query, filter, sorter);
    133             }
    134             else {
    135             hits = searcher.search(query, sorter);
    136             }
    137 
    138             // Return the list of expanded query terms and their frequencies
    139             HashMap term_counts = new HashMap();
    140             HashMap term_fields = new HashMap();
    141             HashSet terms = new HashSet();
    142             query.extractTerms(terms);
    143             Iterator iter = terms.iterator();
    144             while (iter.hasNext()) {
    145             Term term = (Term) iter.next();
    146             // If you wanted to limit this to just text terms add
    147             // something like this:
    148             //if (term.field().equals(TEXTFIELD))
    149             term_counts.put(term.text(), new Integer(0));
    150             term_fields.put(term.text(), term.field());
    151             }
     127            // Perform the query (filter and sorter may be null)
     128            Hits hits = searcher.search(query, filter, sorter);
    152129
    153130            // Do we need to use a hit iterator to get sorted results?
     
    173150            }
    174151            // And skip all the rest
    175            
    176             // From the document, extract the Term Vector for the
    177             // text field
    178             TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD);
    179             if (term_freq_vector != null && term_freq_vector.size() > 0) {
    180                 int[] term_frequencies = term_freq_vector.getTermFrequencies();
    181                 // Now for each query term, determine the
    182                 // frequency - which may of course be 0.
    183                 Set term_counts_set = term_counts.keySet();
    184                 Iterator terms_iter = term_counts_set.iterator();
    185                 while (terms_iter.hasNext()) {
    186                
    187                 String term = (String) terms_iter.next();
    188                 Integer count_integer = (Integer) term_counts.get(term);
    189                 int count = count_integer.intValue();
    190                 int index = term_freq_vector.indexOf(term);
    191                 // If the term has a count, then add to
    192                 // the total count for this term
    193                 if (index != -1) {
    194                     count += term_frequencies[index];
    195                 }
    196                 // Store the result
    197                 term_counts.put(term, new Integer(count));
    198                 count_integer = null;
    199                 term = null;
    200                 }
    201                 terms_iter = null;
    202                 term_counts_set = null;
     152
     153            ++counter;
     154            }
     155
     156            // Return the list of expanded query terms and their frequencies
     157            HashSet terms = new HashSet();
     158            query.extractTerms(terms);
     159            Iterator term_iterator = terms.iterator();
     160            System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
     161            while (term_iterator.hasNext()) {
     162            Term term = (Term) term_iterator.next();
     163
     164            // Get the term frequency over all the documents
     165            TermDocs term_docs = reader.termDocs(term);
     166            int term_freq = term_docs.freq();
     167            while (term_docs.next()) {
     168                term_freq += term_docs.freq();
    203169            }
    204             else {
    205                 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
    206             }
    207             ++counter;
     170
     171            // If you wanted to limit this to just text terms add
     172            // something like this:
     173            // if (term.field().equals(TEXTFIELD))
     174            System.out.println("  <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
    208175            }
    209176           
    210             // Retrieve all the useful terms
    211             Set term_counts_set = term_counts.keySet();
    212             System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
    213             // Iterate over them
    214             Iterator terms_iter = term_counts_set.iterator();
    215             while (terms_iter.hasNext()) {
    216             String term = (String) terms_iter.next();
    217             Integer count = (Integer) term_counts.get(term);
    218             String field = (String) term_fields.get(term);
    219 
    220             // Ignore any terms with zero frequency, because they don't exist in the matching
    221             // documents. It seems that this should never happen, but it's a consequence of
    222             // how the terms are identified. The terms are found by rewriting the query (above).
    223             // At this point, the query hasn't been run, so each query term is expanded without
    224             // knowing whether the expanded term will actually appear in one of the resulting
    225             // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
    226             // the search is for "otago AND auckland", no matching documents may include "otaio".
    227             // Hopefully that made some sense...
    228             if (count.intValue() > 0) {
    229                 System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
    230             }
    231             count = null;
    232             term = null;
    233             }
    234 
    235             // Cleanup
    236             terms_iter = null;
    237             term_counts_set = null;
    238 
    239177            // Return the list of stop words removed from the query
    240178            HashSet terms_including_stop_words = new HashSet();
  • trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r12846 r12975  
    2020import org.apache.lucene.index.IndexReader;
    2121import org.apache.lucene.index.Term;
    22 import org.apache.lucene.index.TermFreqVector;
     22import org.apache.lucene.index.TermDocs;
    2323import org.apache.lucene.queryParser.ParseException;
    2424import org.apache.lucene.queryParser.QueryParser;
     
    5757        QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    5858
    59         Sort sorter = new Sort();
     59        Sort sorter = null;
    6060        Filter filter = null;
    6161        String fuzziness = null;
     
    7575        if (args[i].equals("-filter")) {
    7676            i++;
    77            
    78             // Parse up filter
    7977            filter = parseFilterString(args[i]);
    8078        }
     
    127125            query = query.rewrite(reader);
    128126
    129             // Perform the query
    130             Hits hits;
    131             if (filter != null) {
    132             hits = searcher.search(query, filter, sorter);
    133             }
    134             else {
    135             hits = searcher.search(query, sorter);
    136             }
    137 
    138             // Return the list of expanded query terms and their frequencies
    139             HashMap term_counts = new HashMap();
    140             HashMap term_fields = new HashMap();
    141             HashSet terms = new HashSet();
    142             query.extractTerms(terms);
    143             Iterator iter = terms.iterator();
    144             while (iter.hasNext()) {
    145             Term term = (Term) iter.next();
    146             // If you wanted to limit this to just text terms add
    147             // something like this:
    148             //if (term.field().equals(TEXTFIELD))
    149             term_counts.put(term.text(), new Integer(0));
    150             term_fields.put(term.text(), term.field());
    151             }
     127            // Perform the query (filter and sorter may be null)
     128            Hits hits = searcher.search(query, filter, sorter);
    152129
    153130            // Do we need to use a hit iterator to get sorted results?
     
    173150            }
    174151            // And skip all the rest
    175            
    176             // From the document, extract the Term Vector for the
    177             // text field
    178             TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), TEXTFIELD);
    179             if (term_freq_vector != null && term_freq_vector.size() > 0) {
    180                 int[] term_frequencies = term_freq_vector.getTermFrequencies();
    181                 // Now for each query term, determine the
    182                 // frequency - which may of course be 0.
    183                 Set term_counts_set = term_counts.keySet();
    184                 Iterator terms_iter = term_counts_set.iterator();
    185                 while (terms_iter.hasNext()) {
    186                
    187                 String term = (String) terms_iter.next();
    188                 Integer count_integer = (Integer) term_counts.get(term);
    189                 int count = count_integer.intValue();
    190                 int index = term_freq_vector.indexOf(term);
    191                 // If the term has a count, then add to
    192                 // the total count for this term
    193                 if (index != -1) {
    194                     count += term_frequencies[index];
    195                 }
    196                 // Store the result
    197                 term_counts.put(term, new Integer(count));
    198                 count_integer = null;
    199                 term = null;
    200                 }
    201                 terms_iter = null;
    202                 term_counts_set = null;
     152
     153            ++counter;
     154            }
     155
     156            // Return the list of expanded query terms and their frequencies
     157            HashSet terms = new HashSet();
     158            query.extractTerms(terms);
     159            Iterator term_iterator = terms.iterator();
     160            System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
     161            while (term_iterator.hasNext()) {
     162            Term term = (Term) term_iterator.next();
     163
     164            // Get the term frequency over all the documents
     165            TermDocs term_docs = reader.termDocs(term);
     166            int term_freq = term_docs.freq();
     167            while (term_docs.next()) {
     168                term_freq += term_docs.freq();
    203169            }
    204             else {
    205                 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
    206             }
    207             ++counter;
     170
     171            // If you wanted to limit this to just text terms add
     172            // something like this:
     173            // if (term.field().equals(TEXTFIELD))
     174            System.out.println("  <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
    208175            }
    209176           
    210             // Retrieve all the useful terms
    211             Set term_counts_set = term_counts.keySet();
    212             System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
    213             // Iterate over them
    214             Iterator terms_iter = term_counts_set.iterator();
    215             while (terms_iter.hasNext()) {
    216             String term = (String) terms_iter.next();
    217             Integer count = (Integer) term_counts.get(term);
    218             String field = (String) term_fields.get(term);
    219 
    220             // Ignore any terms with zero frequency, because they don't exist in the matching
    221             // documents. It seems that this should never happen, but it's a consequence of
    222             // how the terms are identified. The terms are found by rewriting the query (above).
    223             // At this point, the query hasn't been run, so each query term is expanded without
    224             // knowing whether the expanded term will actually appear in one of the resulting
    225             // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
    226             // the search is for "otago AND auckland", no matching documents may include "otaio".
    227             // Hopefully that made some sense...
    228             if (count.intValue() > 0) {
    229                 System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
    230             }
    231             count = null;
    232             term = null;
    233             }
    234 
    235             // Cleanup
    236             terms_iter = null;
    237             term_counts_set = null;
    238 
    239177            // Return the list of stop words removed from the query
    240178            HashSet terms_including_stop_words = new HashSet();
Note: See TracChangeset for help on using the changeset viewer.