Changeset 32506
- Timestamp: 2018-10-09T19:24:52+13:00 (5 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java
r30159 r32506 40 40 import org.apache.lucene.queryparser.classic.ParseException; 41 41 import org.apache.lucene.queryparser.classic.QueryParser; 42 import org.apache.lucene.search.BooleanClause; 42 43 import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception 44 import org.apache.lucene.search.ConstantScoreQuery; 43 45 import org.apache.lucene.search.Filter; 44 46 import org.apache.lucene.search.IndexSearcher; … … 167 169 query_including_stop_words = query_including_stop_words.rewrite(reader); 168 170 169 //System.err.println("********* query_string " + query_string + "****");171 System.err.println("********* query_string " + query_string + "****"); 170 172 171 173 Query query = parseQuery(reader, query_parser, query_string, fuzziness); 172 173 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0. 174 // This change in lucene core library for GS3 (present since after version 2.4.1) had the 175 // side-effect that searching on "econom*" didn't display what terms it was searching for, 176 // whereas it had done so in GS2. 177 178 // The details of this problem and its current solution are explained in the ticket 179 // http://trac.greenstone.org/ticket/845 180 181 // We need to change the settings for the rewriteMethod in order to get searches on wildcards 182 // to produce search terms again when the query gets rewritten. 183 184 // We try, in order: 185 // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2 186 // it will expand wildcard searches to its terms when searching at both section AND doc level. 187 // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection) 188 // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1% 189 // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs 190 // 3. 
Then try the default apache rewriteMethod with its optimum defaults of 191 // termCountCutoff=350 and docCountPercent cutoff=0.1% 192 // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html 193 194 if(query instanceof MultiTermQuery) { 195 MultiTermQuery multiTermQuery = (MultiTermQuery)query; 196 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); 197 // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) 198 } 199 200 try { 201 query = query.rewrite(reader); 202 } 203 catch(BooleanQuery.TooManyClauses clauseException) { 204 // Example test case: try searching the lucene demo collection for "a*" 205 // and you'll hit this exception 206 207 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR); 208 209 if(query instanceof MultiTermQuery) { 210 211 // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%. 212 // This will at least expand the query to its terms when searching with wildcards at section-level 213 // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to). 
214 215 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite(); 216 customRewriteMethod.setDocCountPercent(100.0); 217 customRewriteMethod.setTermCountCutoff(350); // same as default 218 219 MultiTermQuery multiTermQuery = (MultiTermQuery)query; 220 multiTermQuery.setRewriteMethod(customRewriteMethod); 221 try { 222 query = query.rewrite(reader); 223 } 224 catch(BooleanQuery.TooManyClauses clauseExceptionAgain) { 225 226 // do what the code originally did: use the default rewriteMethod which 227 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350 228 229 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); 230 query = query.rewrite(reader); 231 } 232 } 233 } 174 query = recursiveRewriteQuery(query, reader); 175 System.err.println("@@@@ final query class name: " + query.getClass()); 234 176 235 177 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0 … … 259 201 260 202 Term term = (Term) iter.next(); 203 System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text()); 261 204 BytesRef term_bytes = term.bytes(); 262 205 DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags? … … 516 459 } 517 460 461 // If you're dealing with a BooleanQuery, they need to be recursively rewritten 462 // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery) 463 // e.g. season* farm 464 // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*. 465 // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard 466 // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys 467 // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms. 468 // Call this method from runQuery() after it calls parseQuery(). 
469 // Now searches like these will work 470 // season* farm 471 // season* farm* 472 // and not just searches like the following which already used to work: 473 // season* 474 // snail farm 475 // Idea for this method came from inspecting source code to BooleanQuery 476 // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java 477 // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery 478 // subcomponents. 479 protected Query recursiveRewriteQuery(Query orig_query, IndexReader reader) throws java.io.IOException 480 { 481 //Query query = orig_query.rewrite(reader); 482 Query query = orig_query; 483 484 if(orig_query instanceof BooleanQuery) { 485 BooleanQuery booleanQuery = (BooleanQuery)orig_query; 486 List<BooleanClause> clauses = booleanQuery.clauses(); 487 for (BooleanClause clause : clauses) { 488 Query subQuery = clause.getQuery(); 489 subQuery = recursiveRewriteQuery(subQuery, reader); 490 clause.setQuery(subQuery); 491 } 492 } 493 494 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0. 495 // This change in lucene core library for GS3 (present since after version 2.4.1) had the 496 // side-effect that searching on "econom*" didn't display what terms it was searching for, 497 // whereas it had done so in GS2. 498 499 // The details of this problem and its current solution are explained in the ticket 500 // http://trac.greenstone.org/ticket/845 501 502 // We need to change the settings for the rewriteMethod in order to get searches on wildcards 503 // to produce search terms again when the query gets rewritten. 504 505 // We try, in order: 506 // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2 507 // it will expand wildcard searches to its terms when searching at both section AND doc level. 
508 // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection) 509 // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1% 510 // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs 511 // 3. Then try the default apache rewriteMethod with its optimum defaults of 512 // termCountCutoff=350 and docCountPercent cutoff=0.1% 513 // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html 514 515 System.err.println("@@@@ query class name: " + orig_query.getClass()); 516 System.err.println("@@@@ QUERY: " + orig_query); 517 518 if(orig_query instanceof MultiTermQuery) { 519 MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query; 520 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE); 521 // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE) 522 } 523 524 try { 525 query = orig_query.rewrite(reader); 526 } 527 catch(BooleanQuery.TooManyClauses clauseException) { 528 // Example test case: try searching the lucene demo collection for "a*" 529 // and you'll hit this exception 530 531 //lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR); 532 533 if(query instanceof MultiTermQuery) { 534 535 // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%. 536 // This will at least expand the query to its terms when searching with wildcards at section-level 537 // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to). 
538 539 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite(); 540 customRewriteMethod.setDocCountPercent(100.0); 541 customRewriteMethod.setTermCountCutoff(350); // same as default 542 543 MultiTermQuery multiTermQuery = (MultiTermQuery)query; 544 multiTermQuery.setRewriteMethod(customRewriteMethod); 545 try { 546 query = query.rewrite(reader); 547 } 548 catch(BooleanQuery.TooManyClauses clauseExceptionAgain) { 549 550 // do what the code originally did: use the default rewriteMethod which 551 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350 552 553 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT); 554 query = query.rewrite(reader); 555 } 556 } 557 } 558 559 if(orig_query == query) { 560 return query; 561 } else { 562 return recursiveRewriteQuery(query, reader); 563 } 564 } 565 518 566 protected Filter parseFilterString(String filter_string) 519 567 {
Note: See TracChangeset for help on using the changeset viewer.