Changeset 26157 for main/trunk


Ignore:
Timestamp:
2012-09-06T16:54:10+12:00 (12 years ago)
Author:
ak19
Message:

A much better way to ensure that searches with wildcards get expanded to their query terms, as in GS2, which uses an older version of Lucene (2.3.2). Wildcard searches now work as before at both document level and section level. If a BooleanQuery.TooManyClauses exception is thrown, the query is rewritten again using other rewrite methods (first a custom one, then the Lucene default), so that results are still returned (instead of the exception forcing it to report that 0 documents were found).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java

    r26155 r26157  
    3939import org.apache.lucene.queryParser.ParseException;
    4040import org.apache.lucene.queryParser.QueryParser;
    41 import org.apache.lucene.search.BooleanQuery.TooManyClauses;
     41import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
    4242import org.apache.lucene.search.Filter;
    4343import org.apache.lucene.search.IndexSearcher;
     44import org.apache.lucene.search.MultiTermQuery;
     45import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
    4446import org.apache.lucene.search.Query;
    4547import org.apache.lucene.search.TermRangeFilter;
     
    5456import org.apache.lucene.util.Version;
    5557
    56 import org.apache.lucene.search.MultiTermQuery;
    57 import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
    5858
    5959public class GS2LuceneQuery extends SharedSoleneQuery
     
    152152
    153153        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
    154         // This change in lucene core library for GS3 had the side-effect that searching on
    155         // "econom*" didn't display what terms it was searching for, whereas it had done so in GS2.
     154        // This change in lucene core library for GS3 (present since after version 2.4.1) had the
     155        // side-effect that searching on "econom*" didn't display what terms it was searching for,
     156        // whereas it had done so in GS2.
    156157
    157158        // The details of this problem and its current solution are explained in the ticket
    158159        // http://trac.greenstone.org/ticket/845
    159160
    160         // We need to change the settings for rewriteMethod in order to get searches on wildcards to
    161         // produce search terms again when the query is rewritten.
     161        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
     162        // to produce search terms again when the query gets rewritten.
     163
     164        // We try, in order:
     165        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
     166        // it will expand wildcard searches to its terms when searching at both section AND doc level.
     167        // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
     168        // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=100%
     169        // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs)
     170        // 3. Then try the default apache rewriteMethod with its optimum defaults of
     171        // termCountCutoff=350 and docCountPercent cutoff=0.1%
     172        //  See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
    162173
    163174        if(query instanceof MultiTermQuery) {
    164 
    165         // default docCountPercent=0.1; default termCountCutoff=350
    166 
    167         // Creating custom cutoff values, taking into account of existing cutoff values
    168         MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
    169         customRewriteMethod.setDocCountPercent(100.0);//MultiTermQuery.ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT);
    170         customRewriteMethod.setTermCountCutoff(350);
    171 
    172175        MultiTermQuery multiTermQuery = (MultiTermQuery)query;
    173         multiTermQuery.setRewriteMethod(customRewriteMethod);
    174 
    175         // the above works when searching with wildcards over sections, the following also
    176         // works on book searches, but has been discouraged as it can throw an exception if
    177         // the number of terms exceeds BooleanQuery.getMaxClauseCount().
    178         // http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
    179 
    180         //multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);//MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
    181         }
    182 
    183         query = query.rewrite(reader);
    184        
     176        multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
     177             // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
     178        }
     179
     180        try {
     181        query = query.rewrite(reader);
     182        }
     183        catch(BooleanQuery.TooManyClauses clauseException) {
     184        // Example test case: try searching the lucene demo collection for "a*"
     185        // and you'll hit this exception
     186
     187        lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
     188
     189        if(query instanceof MultiTermQuery) {
     190
     191            // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
     192            // This will at least expand the query to its terms when searching with wildcards at section-level
     193            // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
     194
     195            MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
     196            customRewriteMethod.setDocCountPercent(100.0);
     197            customRewriteMethod.setTermCountCutoff(350); // same as default
     198           
     199            MultiTermQuery multiTermQuery = (MultiTermQuery)query;
     200            multiTermQuery.setRewriteMethod(customRewriteMethod);
     201            try {
     202            query = query.rewrite(reader);
     203            }
     204            catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
     205
     206            // do what the code originally did: use the default rewriteMethod which
     207            // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
     208
     209            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
     210            query = query.rewrite(reader);
     211            }
     212        }
     213        }
     214
    185215        // Get the list of expanded query terms and their frequencies
    186216        // num docs matching, and total frequency       
     
    285315        lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
    286316    }
    287     catch (TooManyClauses too_many_clauses_exception) {
     317    catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
    288318        lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
    289319    }
Note: See TracChangeset for help on using the changeset viewer.