Context Navigation

← Previous Change
Next Change →

Changeset 12418 for trunk/indexers

Timestamp:

2006-08-09T10:41:39+12:00 (18 years ago)

Author:

mdewsnip

Message:

Now returns parse exceptions and too-many-clauses exceptions as <Error> tags so that they can be identified by the C++ code.

Files:

1 edited

trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java (modified) (5 diffs)

Legend:

(no prefix): Unmodified
+: Added
-: Removed

trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

-              r12415
+              r12418
 import java.io.BufferedReader;
 import java.io.InputStreamReader;
+import java.io.IOException;
 import java.util.Collections;
 import java.util.HashMap;
 …
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.queryParser.ParseException;
 import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanQuery.TooManyClauses;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.Hit;
 …
+    {
     if (args.length == 0) {
         System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
+        System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzzy] [-filter filter_string] [-sort sort_field] [-dco AND|OR]");
         return;
+    }
 …
+                        {
                             i++;
-                            ///ystem.err.println("**** sort by = " + args[i]);
                             sorter = new Sort(args[i]);
+                        }
 …
             break;
+        }
-        ///ystem.err.println("**** query = " + query_string);
-        Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
-        query_including_stop_words = query_including_stop_words.rewrite(reader);
-        Query query = parseQuery(reader, query_parser, query_string, fuzzy);
-        query = query.rewrite(reader);
-        // Perform the query
-        Hits hits;
-        if (filter != null) {
-            hits = searcher.search(query, filter, sorter);
+        }
-        else {
-            hits = searcher.search(query, sorter);
+        }
         System.out.println("<ResultSet>");
         System.out.println("  <QueryString>" + query_string + "</QueryString>");
+        // Return the list of expanded query terms and their frequencies
+                HashMap term_counts = new HashMap();
+                HashMap term_fields = new HashMap();
+        HashSet terms = new HashSet();
+        query.extractTerms(terms);
+        Iterator iter = terms.iterator();
+        while (iter.hasNext())
+                    {
+                        Term term = (Term) iter.next();
+                        // If you wanted to limit this to just TX terms add
+                        // something like this:
+                        //if (term.field().equals("TX"))
+                        term_counts.put(term.text(), new Integer(0));
+                        term_fields.put(term.text(), term.field());
+                    }
+                // Do we need to use a hit iterator to get sorted results?
+                System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+                Iterator hit_iter = hits.iterator();
+                while (hit_iter.hasNext())
+                    {
+                        Hit hit = (Hit) hit_iter.next();
+                        Document doc = hit.getDocument();
+                        String node_id = doc.get("nodeID");
+                        System.out.println("  <Match id=\"" + node_id + "\" />");
+        try {
+            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
+            query_including_stop_words = query_including_stop_words.rewrite(reader);
+            Query query = parseQuery(reader, query_parser, query_string, fuzzy);
+            query = query.rewrite(reader);
+            // Perform the query
+            Hits hits;
+            if (filter != null) {
+            hits = searcher.search(query, filter, sorter);
+            }
+            else {
+            hits = searcher.search(query, sorter);
+            }
+            // Return the list of expanded query terms and their frequencies
+            HashMap term_counts = new HashMap();
+            HashMap term_fields = new HashMap();
+            HashSet terms = new HashSet();
+            query.extractTerms(terms);
+            Iterator iter = terms.iterator();
+            while (iter.hasNext())
+            {
+                Term term = (Term) iter.next();
+                // If you wanted to limit this to just TX terms add
+                // something like this:
+                //if (term.field().equals("TX"))
+                term_counts.put(term.text(), new Integer(0));
+                term_fields.put(term.text(), term.field());
+            }
+            // Do we need to use a hit iterator to get sorted results?
+            System.out.println("  <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
+            Iterator hit_iter = hits.iterator();
+            while (hit_iter.hasNext())
+            {
+                Hit hit = (Hit) hit_iter.next();
+                Document doc = hit.getDocument();
+                String node_id = doc.get("nodeID");
+                System.out.println("  <Match id=\"" + node_id + "\" />");
                         // From the document, extract the Term Vector for the
                         // TX field
                         TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
                         if (term_freq_vector != null && term_freq_vector.size() > 0)
+                            {
                                 int[] term_frequencies = term_freq_vector.getTermFrequencies();
                                 // Now for each query term, determine the
                                 // frequency - which may of course be 0.
                                 Set term_counts_set = term_counts.keySet();
                                 Iterator terms_iter = term_counts_set.iterator();
                                 while (terms_iter.hasNext())
+                                    {
                                         String term = (String) terms_iter.next();
                                         Integer count_integer = (Integer) term_counts.get(term);
                                         int count = count_integer.intValue();
                                         int index = term_freq_vector.indexOf(term);
                                         // If the term has a count, then add to
                                         // the total count for this term
                                         if (index != -1)
+                                            {
                                                 count += term_frequencies[index];
+                // From the document, extract the Term Vector for the
+                // TX field
+                TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
+                if (term_freq_vector != null && term_freq_vector.size() > 0)
+                {
+                    int[] term_frequencies = term_freq_vector.getTermFrequencies();
+                    // Now for each query term, determine the
+                    // frequency - which may of course be 0.
+                    Set term_counts_set = term_counts.keySet();
+                    Iterator terms_iter = term_counts_set.iterator();
+                    while (terms_iter.hasNext())
+                    {
+                        String term = (String) terms_iter.next();
+                        Integer count_integer = (Integer) term_counts.get(term);
+                        int count = count_integer.intValue();
+                        int index = term_freq_vector.indexOf(term);
+                        // If the term has a count, then add to
+                        // the total count for this term
+                        if (index != -1)
+                        {
+                            count += term_frequencies[index];
+                                            }
+                                        // Store the result
+                                        term_counts.put(term, new Integer(count));
+                                        count_integer = null;
+                                        term = null;
+                                    }
+                                terms_iter = null;
+                                term_counts_set = null;
+                            }
+                        else
+                            {
+                                ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
+                            }
+                    }
+                // Retrieve all the useful terms
+                Set term_counts_set = term_counts.keySet();
+                System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
+                // Iterate over them
+                Iterator terms_iter = term_counts_set.iterator();
+                while (terms_iter.hasNext())
+                    {
+                        String term = (String) terms_iter.next();
+                        Integer count = (Integer) term_counts.get(term);
+                        String field = (String) term_fields.get(term);
+                        System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
+                        count = null;
+                        term = null;
+                    }
+                // Cleanup
+                terms_iter = null;
+                term_counts_set = null;
+        // Return the list of stop words removed from the query
+        HashSet terms_including_stop_words = new HashSet();
+        query_including_stop_words.extractTerms(terms_including_stop_words);
+        Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
+        while (terms_including_stop_words_iter.hasNext()) {
+            Term term = (Term) terms_including_stop_words_iter.next();
+            if (!terms.contains(term)) {
+            System.out.println("  <StopWord value=\"" + term.text() + "\"/>");
+                        }
+                        // Store the result
+                        term_counts.put(term, new Integer(count));
+                        count_integer = null;
+                        term = null;
+                    }
+                    terms_iter = null;
+                    term_counts_set = null;
+                }
+                else
+                {
+                    ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
+                }
+            }
+            // Retrieve all the useful terms
+            Set term_counts_set = term_counts.keySet();
+            System.out.println("  <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
+            // Iterate over them
+            Iterator terms_iter = term_counts_set.iterator();
+            while (terms_iter.hasNext())
+            {
+                String term = (String) terms_iter.next();
+                Integer count = (Integer) term_counts.get(term);
+                String field = (String) term_fields.get(term);
+                System.out.println("  <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
+                count = null;
+                term = null;
+            }
+            // Cleanup
+            terms_iter = null;
+            term_counts_set = null;
+            // Return the list of stop words removed from the query
+            HashSet terms_including_stop_words = new HashSet();
+            query_including_stop_words.extractTerms(terms_including_stop_words);
+            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
+            while (terms_including_stop_words_iter.hasNext()) {
+            Term term = (Term) terms_including_stop_words_iter.next();
+            if (!terms.contains(term)) {
+                System.out.println("  <StopWord value=\"" + term.text() + "\"/>");
+            }
+            }
+        }
+        System.out.println("</ResultSet>");
+        catch (ParseException parse_exception) {
+            System.out.println("  <Error type=\"PARSE_EXCEPTION\"/>");
+        }
+        catch (TooManyClauses too_many_clauses_exception) {
+            System.out.println("  <Error type=\"TOO_MANY_CLAUSES\"/>");
+        }
+        System.out.println("</ResultSet>");
+        }
         searcher.close();
+    }
     catch (Exception exception) {
+    catch (IOException exception) {
         exception.printStackTrace();
+        }
+    }
+    }

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 12418 for trunk/indexers

Legend:

trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

Download in other formats: