Changeset 26157

Show
Ignore:
Timestamp:
06.09.2012 16:54:10 (7 years ago)
Author:
ak19
Message:

A much better way to ensure that searches with wildcards get expanded to query terms, as in GS2, which uses an older version of Lucene (2.3.2). Wildcard searches now work as before at both document level and section level. If a BooleanQuery.TooManyClauses exception is thrown, it will try rewriting the query again using other rewrite methods, so that results are still returned (instead of the exception forcing it to report that 0 documents were found).

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java

    r26155 r26157  
    3939import org.apache.lucene.queryParser.ParseException; 
    4040import org.apache.lucene.queryParser.QueryParser; 
    41 import org.apache.lucene.search.BooleanQuery.TooManyClauses; 
     41import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception 
    4242import org.apache.lucene.search.Filter; 
    4343import org.apache.lucene.search.IndexSearcher; 
     44import org.apache.lucene.search.MultiTermQuery; 
     45import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite; 
    4446import org.apache.lucene.search.Query; 
    4547import org.apache.lucene.search.TermRangeFilter; 
     
    5456import org.apache.lucene.util.Version; 
    5557 
    56 import org.apache.lucene.search.MultiTermQuery; 
    57 import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite; 
    5858 
    5959public class GS2LuceneQuery extends SharedSoleneQuery 
     
    152152 
    153153        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.  
    154         // This change in lucene core library for GS3 had the side-effect that searching on  
    155         // "econom*" didn't display what terms it was searching for, whereas it had done so in GS2.  
     154        // This change in lucene core library for GS3 (present since after version 2.4.1) had the 
     155        // side-effect that searching on "econom*" didn't display what terms it was searching for,  
     156        // whereas it had done so in GS2.  
    156157 
    157158        // The details of this problem and its current solution are explained in the ticket  
    158159        // http://trac.greenstone.org/ticket/845 
    159160 
    160         // We need to change the settings for rewriteMethod in order to get searches on wildcards to  
    161         // produce search terms again when the query is rewritten. 
     161        // We need to change the settings for the rewriteMethod in order to get searches on wildcards 
     162        // to produce search terms again when the query gets rewritten. 
     163 
     164        // We try, in order: 
     165        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2 
     166        // it will expand wildcard searches to its terms when searching at both section AND doc level. 
     167        // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection) 
     168        // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=100% 
     169        // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs) 
     170        // 3. Then try the default apache rewriteMethod with its optimum defaults of  
     171        // termCountCutoff=350 and docCountPercent cutoff=0.1% 
     172        //  See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html 
    162173 
    163174        if(query instanceof MultiTermQuery) { 
    164  
    165         // default docCountPercent=0.1; default termCountCutoff=350 
    166  
    167         // Creating custom cutoff values, taking into account of existing cutoff values 
    168         MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite(); 
    169         customRewriteMethod.setDocCountPercent(100.0);//MultiTermQuery.ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT); 
    170         customRewriteMethod.setTermCountCutoff(350); 
    171  
    172175        MultiTermQuery multiTermQuery = (MultiTermQuery)query; 
    173         multiTermQuery.setRewriteMethod(customRewriteMethod); 
    174  
    175         // the above works when searching with wildcards over sections, the following also  
    176         // works on book searches, but has been discouraged as it can throw an exception if  
    177         // the number of terms exceeds BooleanQuery.getMaxClauseCount(). 
    178         // http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html 
    179  
    180         //multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);//MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);  
    181         } 
    182  
    183         query = query.rewrite(reader); 
    184          
     176        multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); 
     177             // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) 
     178        } 
     179 
     180        try { 
     181        query = query.rewrite(reader); 
     182        }  
     183        catch(BooleanQuery.TooManyClauses clauseException) { 
     184        // Example test case: try searching the lucene demo collection for "a*"  
     185        // and you'll hit this exception 
     186 
     187        lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR); 
     188 
     189        if(query instanceof MultiTermQuery) { 
     190 
     191            // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.  
     192            // This will at least expand the query to its terms when searching with wildcards at section-level  
     193            // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to). 
     194 
     195            MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite(); 
     196            customRewriteMethod.setDocCountPercent(100.0); 
     197            customRewriteMethod.setTermCountCutoff(350); // same as default 
     198             
     199            MultiTermQuery multiTermQuery = (MultiTermQuery)query; 
     200            multiTermQuery.setRewriteMethod(customRewriteMethod); 
     201            try { 
     202            query = query.rewrite(reader); 
     203            }  
     204            catch(BooleanQuery.TooManyClauses clauseExceptionAgain) { 
     205 
     206            // do what the code originally did: use the default rewriteMethod which 
     207            // uses a default docCountPercent=0.1 (%) and termCountCutoff=350 
     208 
     209            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); 
     210            query = query.rewrite(reader); 
     211            } 
     212        } 
     213        } 
     214 
    185215        // Get the list of expanded query terms and their frequencies  
    186216        // num docs matching, and total frequency        
     
    285315        lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR); 
    286316    } 
    287     catch (TooManyClauses too_many_clauses_exception) { 
     317    catch (BooleanQuery.TooManyClauses too_many_clauses_exception) { 
    288318        lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR); 
    289319    }