Ignore:
Timestamp:
2006-08-02T12:45:56+12:00 (18 years ago)
Author:
mdewsnip
Message:

Now returns the stop words that have been removed from the query.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java

    r12364 r12372  
    3838        try {
    3939        Searcher searcher = new IndexSearcher(args[0]);
     40        IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
     41
    4042        Sort sorter = new Sort();
     43
    4144            // New code to allow the default conjunction operator to be
    4245            // definable
     
    5659                }
    5760
    58         Analyzer analyzer = new StandardAnalyzer();
    59         IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
     61        // Create one query parser with the standard set of stop words, and one with none
     62        QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
     63        QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
     64
     65        // Lucene does "OR" queries by default; do an "AND" query if specified
     66        if (default_conjuction_operator.equals("AND")) {
     67        query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
     68        query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
     69        }
    6070
    6171        BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
     
    6979
    7080        // Parse the query and rewrite it into individual terms (eg. for wildcard searches)
    71         QueryParser query_parser = new QueryParser("TX", analyzer);
    72                 // Set the default conjunction operator
    73                 System.err.println("**** DCO = " + default_conjuction_operator);
    74                 if (default_conjuction_operator.equals("AND"))
    75                     {
    76                         query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
    77                     }
    78                 // Otherwise it's OR
    79 
    8081        Query query = query_parser.parse(query_string);
    8182        query = query.rewrite(reader);
     83        Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
     84        query_including_stop_words = query_including_stop_words.rewrite(reader);
    8285
    8386        // Perform the query
     
    9093        query.extractTerms(terms);
    9194        System.out.println("  <QueryTermsInfo num=\"" + terms.size() + "\"/>");
    92         Iterator iter = terms.iterator();
    93         while (iter.hasNext()) {
    94             Term term = (Term) iter.next();
    95                     //System.out.println("  <Term value=\"" + term.text() + "\" freq=\"" + reader.docFreq(term) + "\"/>");
     95        Iterator terms_iter = terms.iterator();
     96        while (terms_iter.hasNext()) {
     97            Term term = (Term) terms_iter.next();
    9698            System.out.println("  <Term value=\"" + term.text() + "\" freq=\"" + reader.docFreq(term) + "\" field=\"" + term.field() + "\"/>");
     99        }
     100
     101        // Return the list of stop words removed from the query
     102        HashSet terms_including_stop_words = new HashSet();
     103        query_including_stop_words.extractTerms(terms_including_stop_words);
     104        System.out.println("  <StopWordsInfo num=\"" + (terms_including_stop_words.size() - terms.size()) + "\"/>");
     105        Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
     106        while (terms_including_stop_words_iter.hasNext()) {
     107            Term term = (Term) terms_including_stop_words_iter.next();
     108            if (!terms.contains(term)) {
     109            System.err.println("  <StopWord value=\"" + term.text() + "\"/>");
     110            }
    97111        }
    98112
Note: See TracChangeset for help on using the changeset viewer.