Context Navigation

← Previous Changeset
Next Changeset →

Changeset 12377

Timestamp:

2006-08-02T15:07:47+12:00 (18 years ago)

Author:

mdewsnip

Message:

Now returns query term occurrences correctly, and does fuzzy searching with the -fuzzy argument. Many thanks to John Thompson and DL Consulting Ltd.

Location:

trunk

Files:

3 edited

gsdl/bin/java/LuceneWrap.jar (modified) (previous)
gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java (modified) (8 diffs)
indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java (modified) (8 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java

-              r12375
+              r12377
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 …
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Hit;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 …
         Sort sorter = new Sort();
+            boolean fuzzy = false;
             // New code to allow the default conjunction operator to be
 …
+                        {
                             i++;
+                            ///ystem.err.println("**** sort by = " + args[i]);
                             sorter = new Sort(args[i]);
+                        }
 …
                             default_conjuction_operator = args[i];
+                        }
+                    if (args[i].equals("-fuzzy"))
+                        {
+                            fuzzy = true;
+                        }
+                }
 …
             break;
+        }
+        System.err.println("**** query = " + query_string);
+        // Parse the query and rewrite it into individual terms (eg. for wildcard searches)
+        ///ystem.err.println("**** query = " + query_string);
         Query query = query_parser.parse(query_string);
         query = query.rewrite(reader);
 …
         query_including_stop_words = query_including_stop_words.rewrite(reader);
+                // If this is a fuzzy search, then we need to add the fuzzy
+                // flag to each of the query terms
+                if (fuzzy && query.toString().length() > 0)
+                    {
+                        // Revert the query to a string
+                        ///ystem.err.println("Rewritten query: " + query.toString());
+                        // Search through the string for TX:<term> query terms
+                        // and append the ~ operator. Note that this search will
+                        // not change phrase searches (TX:"<term> <term>") as
+                        // fuzzy searching is not possible for these entries.
+                        // Yahoo! Time for a state machine!
+                        StringBuffer mutable_query_string = new StringBuffer(query.toString());
+                        int o = 0; // Offset
+                        // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
+                        int s = 0; // State
+                        while(o < mutable_query_string.length())
+                            {
+                                char c = mutable_query_string.charAt(o);
+                                ///ystem.err.println("SM: in state " + s + " and reading a " + c);
+                                if (s == 0 && c == 'T')
+                                    {
+                                        ///ystem.err.println("Found T!");
+                                        s = 1;
+                                    }
+                                else if (s == 1)
+                                    {
+                                        if (c == 'X')
+                                            {
+                                                ///ystem.err.println("Found X!");
+                                                s = 2;
+                                            }
+                                        else
+                                            {
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                else if (s == 2)
+                                    {
+                                        if (c == ':')
+                                            {
+                                                ///ystem.err.println("Found TX:!");
+                                                s = 3;
+                                            }
+                                        else
+                                            {
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                else if (s == 3)
+                                    {
+                                        // Don't process phrases
+                                        if (c == '"')
+                                            {
+                                                ///ystem.err.println("Stupid phrase...");
+                                                s = 0; // Reset
+                                            }
+                                        // Found the end of the term (whitespace or a
+                                        // closing parenthesis)... insert the fuzzy
+                                        // search indicator just before that character
+                                        if (Character.isWhitespace(c) || c == ')')
+                                            {
+                                                ///ystem.err.println("Yahoo! Found fuzzy term.");
+                                                mutable_query_string.insert(o, '~');
+                                                o++;
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                o++;
+                            }
+                        // If we were in the state of looking for the end of a
+                        // term - then we just found it!
+                        if (s == 3)
+                            {
+                                mutable_query_string.append('~');
+                            }
+                        // Reparse the query
+                        ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString());
+                        query = query_parser.parse(mutable_query_string.toString());
+                        // And rewrite again
+                        query = query.rewrite(reader);
+                        ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
+                    }
         // Perform the query
         Hits hits = searcher.search(query, sorter);
         System.out.println("<ResultSet>");
         System.out.println("  <QueryString>" + query_string + "</QueryString>");
         // Return the list of expanded query terms and their frequencies
+                HashMap term_counts = new HashMap();
+                HashMap term_fields = new HashMap();
         HashSet terms = new HashSet();
         query.extractTerms(terms);
+        System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
+        Iterator terms_iter = terms.iterator();
+        while (terms_iter.hasNext()) {
+            Term term = (Term) terms_iter.next();
+            System.out.println("  <Term value=\"" + term.text() + "\" freq=\"" + reader.docFreq(term) + "\" field=\"" + term.field() + "\"/>");
+        }
+        //System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
+        Iterator iter = terms.iterator();
+        while (iter.hasNext())
+                    {
+                        Term term = (Term) iter.next();
+                        // If you wanted to limit this to just TX terms add
+                        // something like this:
+                        //if (term.field().equals("TX"))
+                        term_counts.put(term.text(), new Integer(0));
+                        term_fields.put(term.text(), term.field());
+                    }
+                // Do we need to use a hit iterator to get sorted results?
+                System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+                Iterator hit_iter = hits.iterator();
+                while (hit_iter.hasNext())
+                    {
+                        Hit hit = (Hit) hit_iter.next();
+                        Document doc = hit.getDocument();
+                        String node_id = doc.get("nodeID");
+                        System.out.println("  <Match id=\"" + node_id + "\" />");
+                        // From the document, extract the Term Vector for the
+                        // TX field
+                        TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
+                        if (term_freq_vector.size() > 0)
+                            {
+                                int[] term_frequencies = term_freq_vector.getTermFrequencies();
+                                // Now for each query term, determine the
+                                // frequency - which may of course be 0.
+                                Set term_counts_set = term_counts.keySet();
+                                Iterator terms_iter = term_counts_set.iterator();
+                                while (terms_iter.hasNext())
+                                    {
+                                        String term = (String) terms_iter.next();
+                                        Integer count_integer = (Integer) term_counts.get(term);
+                                        int count = count_integer.intValue();
+                                        int index = term_freq_vector.indexOf(term);
+                                        // If the term has a count, then add to
+                                        // the total count for this term
+                                        if (index != -1)
+                                            {
+                                                count += term_frequencies[index];
+                                            }
+                                        // Store the result
+                                        term_counts.put(term, new Integer(count));
+                                        count_integer = null;
+                                        term = null;
+                                    }
+                                terms_iter = null;
+                                term_counts_set = null;
+                            }
+                        else
+                            {
+                                ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
+                            }
+                    }
+                // Retrieve all the useful terms
+                Set term_counts_set = term_counts.keySet();
+                System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
+                // Iterate over them
+                Iterator terms_iter = term_counts_set.iterator();
+                while (terms_iter.hasNext())
+                    {
+                        String term = (String) terms_iter.next();
+                        Integer count = (Integer) term_counts.get(term);
+                        String field = (String) term_fields.get(term);
+                        System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
+                        count = null;
+                        term = null;
+                    }
+                // Cleanup
+                terms_iter = null;
+                term_counts_set = null;
         // Return the list of stop words removed from the query
         HashSet terms_including_stop_words = new HashSet();
         query_including_stop_words.extractTerms(terms_including_stop_words);
-        System.out.println("  <StopWordsInfo num=\"" + (terms_including_stop_words.size() - terms.size()) + "\"/>");
         Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
         while (terms_including_stop_words_iter.hasNext()) {
 …
+        }
+        // Return the matching documents
+        System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+        for (int i = 0; i < hits.length(); i++) {
+            Document doc = hits.doc(i);
+            String node_id = doc.get("nodeID");
+            System.out.println("  <Match id=\"" + node_id + "\"/>");
+        }
+        System.out.println("</ResultSet>");
+        System.out.println("</ResultSet>");
+        }

trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

-              r12375
+              r12377
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Set;
 import org.apache.lucene.analysis.Analyzer;
 …
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermFreqVector;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.Hit;
 import org.apache.lucene.search.Hits;
 import org.apache.lucene.search.IndexSearcher;
 …
         Sort sorter = new Sort();
+            boolean fuzzy = false;
             // New code to allow the default conjunction operator to be
 …
+                        {
                             i++;
+                            ///ystem.err.println("**** sort by = " + args[i]);
                             sorter = new Sort(args[i]);
+                        }
 …
                             default_conjuction_operator = args[i];
+                        }
+                    if (args[i].equals("-fuzzy"))
+                        {
+                            fuzzy = true;
+                        }
+                }
 …
             break;
+        }
+        System.err.println("**** query = " + query_string);
+        // Parse the query and rewrite it into individual terms (eg. for wildcard searches)
+        ///ystem.err.println("**** query = " + query_string);
         Query query = query_parser.parse(query_string);
         query = query.rewrite(reader);
 …
         query_including_stop_words = query_including_stop_words.rewrite(reader);
+                // If this is a fuzzy search, then we need to add the fuzzy
+                // flag to each of the query terms
+                if (fuzzy && query.toString().length() > 0)
+                    {
+                        // Revert the query to a string
+                        ///ystem.err.println("Rewritten query: " + query.toString());
+                        // Search through the string for TX:<term> query terms
+                        // and append the ~ operator. Note that this search will
+                        // not change phrase searches (TX:"<term> <term>") as
+                        // fuzzy searching is not possible for these entries.
+                        // Yahoo! Time for a state machine!
+                        StringBuffer mutable_query_string = new StringBuffer(query.toString());
+                        int o = 0; // Offset
+                        // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
+                        int s = 0; // State
+                        while(o < mutable_query_string.length())
+                            {
+                                char c = mutable_query_string.charAt(o);
+                                ///ystem.err.println("SM: in state " + s + " and reading a " + c);
+                                if (s == 0 && c == 'T')
+                                    {
+                                        ///ystem.err.println("Found T!");
+                                        s = 1;
+                                    }
+                                else if (s == 1)
+                                    {
+                                        if (c == 'X')
+                                            {
+                                                ///ystem.err.println("Found X!");
+                                                s = 2;
+                                            }
+                                        else
+                                            {
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                else if (s == 2)
+                                    {
+                                        if (c == ':')
+                                            {
+                                                ///ystem.err.println("Found TX:!");
+                                                s = 3;
+                                            }
+                                        else
+                                            {
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                else if (s == 3)
+                                    {
+                                        // Don't process phrases
+                                        if (c == '"')
+                                            {
+                                                ///ystem.err.println("Stupid phrase...");
+                                                s = 0; // Reset
+                                            }
+                                        // Found the end of the term (whitespace or a
+                                        // closing parenthesis)... insert the fuzzy
+                                        // search indicator just before that character
+                                        if (Character.isWhitespace(c) || c == ')')
+                                            {
+                                                ///ystem.err.println("Yahoo! Found fuzzy term.");
+                                                mutable_query_string.insert(o, '~');
+                                                o++;
+                                                s = 0; // Reset
+                                            }
+                                    }
+                                o++;
+                            }
+                        // If we were in the state of looking for the end of a
+                        // term - then we just found it!
+                        if (s == 3)
+                            {
+                                mutable_query_string.append('~');
+                            }
+                        // Reparse the query
+                        ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString());
+                        query = query_parser.parse(mutable_query_string.toString());
+                        // And rewrite again
+                        query = query.rewrite(reader);
+                        ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
+                    }
         // Perform the query
         Hits hits = searcher.search(query, sorter);
         System.out.println("<ResultSet>");
         System.out.println("  <QueryString>" + query_string + "</QueryString>");
         // Return the list of expanded query terms and their frequencies
+                HashMap term_counts = new HashMap();
+                HashMap term_fields = new HashMap();
         HashSet terms = new HashSet();
         query.extractTerms(terms);
+        System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
+        Iterator terms_iter = terms.iterator();
+        while (terms_iter.hasNext()) {
+            Term term = (Term) terms_iter.next();
+            System.out.println("  <Term value=\"" + term.text() + "\" freq=\"" + reader.docFreq(term) + "\" field=\"" + term.field() + "\"/>");
+        }
+        //System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
+        Iterator iter = terms.iterator();
+        while (iter.hasNext())
+                    {
+                        Term term = (Term) iter.next();
+                        // If you wanted to limit this to just TX terms add
+                        // something like this:
+                        //if (term.field().equals("TX"))
+                        term_counts.put(term.text(), new Integer(0));
+                        term_fields.put(term.text(), term.field());
+                    }
+                // Do we need to use a hit iterator to get sorted results?
+                System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+                Iterator hit_iter = hits.iterator();
+                while (hit_iter.hasNext())
+                    {
+                        Hit hit = (Hit) hit_iter.next();
+                        Document doc = hit.getDocument();
+                        String node_id = doc.get("nodeID");
+                        System.out.println("  <Match id=\"" + node_id + "\" />");
+                        // From the document, extract the Term Vector for the
+                        // TX field
+                        TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
+                        if (term_freq_vector.size() > 0)
+                            {
+                                int[] term_frequencies = term_freq_vector.getTermFrequencies();
+                                // Now for each query term, determine the
+                                // frequency - which may of course be 0.
+                                Set term_counts_set = term_counts.keySet();
+                                Iterator terms_iter = term_counts_set.iterator();
+                                while (terms_iter.hasNext())
+                                    {
+                                        String term = (String) terms_iter.next();
+                                        Integer count_integer = (Integer) term_counts.get(term);
+                                        int count = count_integer.intValue();
+                                        int index = term_freq_vector.indexOf(term);
+                                        // If the term has a count, then add to
+                                        // the total count for this term
+                                        if (index != -1)
+                                            {
+                                                count += term_frequencies[index];
+                                            }
+                                        // Store the result
+                                        term_counts.put(term, new Integer(count));
+                                        count_integer = null;
+                                        term = null;
+                                    }
+                                terms_iter = null;
+                                term_counts_set = null;
+                            }
+                        else
+                            {
+                                ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
+                            }
+                    }
+                // Retrieve all the useful terms
+                Set term_counts_set = term_counts.keySet();
+                System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
+                // Iterate over them
+                Iterator terms_iter = term_counts_set.iterator();
+                while (terms_iter.hasNext())
+                    {
+                        String term = (String) terms_iter.next();
+                        Integer count = (Integer) term_counts.get(term);
+                        String field = (String) term_fields.get(term);
+                        System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
+                        count = null;
+                        term = null;
+                    }
+                // Cleanup
+                terms_iter = null;
+                term_counts_set = null;
         // Return the list of stop words removed from the query
         HashSet terms_including_stop_words = new HashSet();
         query_including_stop_words.extractTerms(terms_including_stop_words);
-        System.out.println("  <StopWordsInfo num=\"" + (terms_including_stop_words.size() - terms.size()) + "\"/>");
         Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
         while (terms_including_stop_words_iter.hasNext()) {
 …
+        }
+        // Return the matching documents
+        System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+        for (int i = 0; i < hits.length(); i++) {
+            Document doc = hits.doc(i);
+            String node_id = doc.get("nodeID");
+            System.out.println("  <Match id=\"" + node_id + "\"/>");
+        }
+        System.out.println("</ResultSet>");
+        System.out.println("</ResultSet>");
+        }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12377

Legend:

trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java

trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

Download in other formats: