source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 32729

Last change on this file since 32729 was 32729, checked in by ak19, 5 years ago

Part of Ticket #947 the Lucene Index File Locking Fix. Lucene search had broken for GS2, thanks to Pascal Angst for identifying this. The fix committed at that time was incomplete as it had not been applied for GS2, because GS2 went through GS2LuceneQuery's main() method. GS2LuceneQuery still managed to compile after the fix with no syntax errors, because the superclass' initialise() method ended up getting called from main(), instead of the new GS2LuceneQuery.initialise(IndexReader) variant. Now the GS2LuceneQuery.main() method used by GS2 behaves like the GS2LuceneSearch class used by GS3: first instantiating an IndexReader object then passing this to the GS2LuceneQuery object's initialise(IndexReader) method. Lucene searching should work again in GS2, will test.

File size: 32.0 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanClause;
43import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
44import org.apache.lucene.search.ConstantScoreQuery;
45import org.apache.lucene.search.Filter;
46import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
47import org.apache.lucene.search.MultiTermQuery;
48import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
49import org.apache.lucene.search.Query;
50import org.apache.lucene.search.TermRangeFilter;
51import org.apache.lucene.search.ScoreDoc;
52import org.apache.lucene.search.Sort;
53import org.apache.lucene.search.SortField;
54import org.apache.lucene.search.TopFieldDocs;
55
56import org.apache.lucene.index.DocsEnum;
57import org.apache.lucene.index.MultiFields;
58
59import org.apache.lucene.store.Directory;
60import org.apache.lucene.store.FSDirectory;
61
62import org.apache.lucene.util.Bits;
63import org.apache.lucene.util.BytesRef;
64import org.apache.lucene.util.Version;
65
66public class GS2LuceneQuery extends SharedSoleneQuery
67{
68 public static String SORT_RANK = "rank";
69 public static String SORT_NATURAL = "natural";
70
71 protected String full_indexdir="";
72
73 protected SortField.Type sort_type = SortField.Type.SCORE;
74 protected boolean reverse_sort = false;
75 protected Sort sorter=new Sort();
76 protected Filter filter = null;
77
78 protected QueryParser query_parser = null;
79 protected QueryParser query_parser_no_stop_words = null;
80 protected IndexSearcher searcher = null;
81 protected IndexReader reader = null; // reference to a Reader resource. GS2LuceneQuery doesn't maintain it, GS2LuceneSearch maintains it!
82 // GS2LuceneSearch locally instantiates one GS2LuceneQuery object per query then allows each Query instance use a relevant Reader.
83 // But GS2LuceneSearch opens the IndexReaders and, more importantly, closes them all when a collection is deactivated.
84
85 public GS2LuceneQuery() {
86 super();
87
88 // Create one query parser with the standard set of stop words, and one with none
89
90 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
91 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
92 }
93
94 public boolean initialise(IndexReader reader) {
95
96 if (!super.initialise()) {
97 return false;
98 }
99
100
101 if (full_indexdir==null || full_indexdir.length()==-1){
102 utf8out.println("Index directory is not indicated ");
103 utf8out.flush();
104 return false;
105 }
106
107 if(reader == null) {
108 return false;
109 }
110 else {
111 this.reader = reader;
112 this.searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
113 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
114 return true;
115 }
116 }
117
    /** Records the full path to the index directory; checked for non-emptiness in initialise(). */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }
121
122 public void setSortField(String sort_field) {
123 if (sort_field.equals(SORT_RANK)) {
124 this.sort_field = null;
125 this.sort_type = SortField.Type.SCORE;
126 } else if (sort_field.equals(SORT_NATURAL)) {
127 this.sort_field = null;
128 this.sort_type = SortField.Type.DOC;
129 } else {
130 this.sort_field = sort_field;
131 this.sort_type = SortField.Type.STRING; // for now. numeric??
132 }
133 }
    /** Sets whether results should be returned in reverse of the chosen sort order. */
    public void setReverseSort(boolean reverse) {
	this.reverse_sort = reverse;
    }
    /** @return true when results are to be returned in reverse sort order */
    public boolean getReverseSort() {
	return this.reverse_sort;
    }
140
    /**
     * Stores the raw filter string (via the superclass, for cache-key building)
     * and additionally parses it into the Lucene Filter applied by runQuery().
     */
    public void setFilterString(String filter_string) {
	super.setFilterString(filter_string);
	this.filter = parseFilterString(filter_string);
    }

    /** @return the Filter parsed from the last setFilterString() call, or null if none/unparseable */
    public Filter getFilter() {
	return this.filter;
    }
149
150
    /**
     * Runs the given raw query string against the index and returns a populated
     * LuceneQueryResult containing: the expanded query terms with their document
     * and total frequencies, the stop words removed from the query, and the
     * matching documents (docOID, score, per-document term frequency) between
     * start_results and end_results.
     *
     * Requires initialise(IndexReader) to have succeeded first (this.reader and
     * this.searcher must be set).
     *
     * @return the result object (with an error code set on failure), or null when
     *         query_string is null/empty
     */
    public LuceneQueryResult runQuery(String query_string) {

	if (query_string == null || query_string.equals("")) {
	    utf8out.println("The query word is not indicated ");
	    utf8out.flush();
	    return null;
	}

	LuceneQueryResult lucene_query_result = new LuceneQueryResult();
	lucene_query_result.clear();

	if(this.reader == null) {
	    // Diagnostic only: execution continues and will NPE below.
	    // Indicates initialise(reader) was never called or failed.
	    System.err.println("#### Reader is null!");
	}

	try {
	    // Parse the query twice: once keeping stop words, so that the set
	    // difference against the stop-word-filtered parse (further below)
	    // reveals which stop words were removed.
	    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
	    query_including_stop_words = query_including_stop_words.rewrite(reader);

	    // System.err.println("********* query_string " + query_string + "****");

	    // The real query: parsed with stop-word removal (and optional fuzziness),
	    // then recursively rewritten so wildcard clauses expand into concrete Terms.
	    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
	    query = recursivelyRewriteQuery(query, reader, lucene_query_result);
	    // System.err.println("@@@@ final query class name: " + query.getClass());

	    // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
	    // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
	    // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
	    // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
	    // http://lucene.apache.org/core/4_7_2/MIGRATE.html

	    // Get the list of expanded query terms and their frequencies
	    // num docs matching, and total frequency
	    HashSet terms = new HashSet();
	    query.extractTerms(terms);

	    // Maps Lucene doc number -> summed frequency of all query terms in that doc.
	    HashMap doc_term_freq_map = new HashMap();

	    Iterator iter = terms.iterator();

	    Bits liveDocs = null;
	    if(reader.hasDeletions()) {
		System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
		liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
	    }

	    while (iter.hasNext()) {

		// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

		Term term = (Term) iter.next();
		// System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
		BytesRef term_bytes = term.bytes();
		DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

		// Get the term frequency over all the documents
		//TermDocs term_docs = reader.termDocs(term);
		int term_freq = 0;   // total occurrences of this term across all live docs
		int match_docs = 0;  // number of live docs containing this term

		if(term_docs != null) {
		    int docID = -1;
		    while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
			if (term_docs.freq() != 0)
			{
			    term_freq += term_docs.freq();
			    match_docs++;

			    // Calculate the document-level term frequency as well
			    Integer lucene_doc_num_obj = new Integer(term_docs.docID());
			    int doc_term_freq = 0;
			    if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
			    {
				doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
			    }
			    doc_term_freq += term_docs.freq();

			    doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
			}
		    }
		} else {
		    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
		}

		// Create a term
		lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
	    }

	    // Get the list of stop words removed from the query:
	    // any term present in the unfiltered parse but absent from the
	    // stop-word-filtered parse must have been a stop word.
	    HashSet terms_including_stop_words = new HashSet();
	    query_including_stop_words.extractTerms(terms_including_stop_words);
	    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
	    while (terms_including_stop_words_iter.hasNext()) {
		Term term = (Term) terms_including_stop_words_iter.next();
		if (!terms.contains(term)) {
		    lucene_query_result.addStopWord(term.text());
		}
	    }

	    // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
	    // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
	    // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

	    // 1. Figure out how many results there will be.
	    //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
	    //searcher.search(query, filter, collector);
	    //int hitCount = collector.count;

	    // Actually do the query
	    // Simple case for getting all the matching documents
	    // NOTE(review): passing Integer.MAX_VALUE as the result-count cap may be
	    // memory-hungry on very large result sets — confirm acceptable for GS2 indexes.
	    if (end_results == Integer.MAX_VALUE) {
		// Perform the query (filter and sorter may be null)
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
		// Is there a slight difference in the definition between
		// https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
		// and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
		// Seems to be okay.
		// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

		lucene_query_result.setTotalDocs(hits.totalHits);

		// Output the matching documents
		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(hits.totalHits); // ??

		for (int i = start_results; i < hits.totalHits; i++) {
		    int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
		    Document doc = reader.document(lucene_doc_num);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
		}
	    }

	    // Slightly more complicated case for returning a subset of the matching documents
	    else {
		// Perform the query (filter may be null)
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
		// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
		lucene_query_result.setTotalDocs(hits.totalHits);

		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

		// Output the matching documents
		// NOTE(review): start_results appears to be 1-based here (loop starts at
		// start_results, not start_results-1) — confirm against lucene_query.pl.
		for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
		    int lucene_doc_num = hits.scoreDocs[i].doc;
		    Document doc = reader.document(lucene_doc_num);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
		}
	    }
	}

	catch (ParseException parse_exception) {
	    lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
	}
	catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
	}
	catch (IOException exception) {
	    lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
	    exception.printStackTrace();
	}
	catch (Exception exception) {
	    lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
	    exception.printStackTrace();
	}
	return lucene_query_result;
    }
330
331 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
332 super.setDefaultConjunctionOperator(default_conjunction_operator);
333
334 if (default_conjunction_operator.equals("AND")) {
335 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
336 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
337 } else { // default is OR
338 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
339 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
340 }
341 }
342
    // This version of the cleanUp() method is just to clean up anything associated only with this instance of GS2LuceneQuery.
    // So it won't clean up the singleton IndexReader instances maintained by the encapsulating GS2LuceneSearch class.
    /** Releases per-query state. Deliberately does NOT close this.reader — its owner does. */
    public void cleanUp() {
	super.cleanUp();

	// Drop the searcher reference only; it merely wraps the shared reader.
	searcher = null;

	// Don't close the indexReader reference here.
	// This has moved into the GS2LuceneSearch.cleanUp() method, as it maintains singleton IndexReaders
	// for each index level (sidx, didix) with lifespans matching their collection's lifespan
	// A collection's GS2LuceneSearch object lives for the duration of the Collection.
	// A GS2LuceneQuery object is ephemeral: only lives for the duration of a query, allowing multiple
	// users to do queries concurrently, sharing a single IndexReader object for each indexing level
	// since IndexReaders support concurrency.
    }
358
    /**
     * Parses a raw query string into a Lucene Query. The string is assumed to
     * start with a parenthesised search-term group ("+(...)"), followed by an
     * optional filter suffix; when a fuzziness value is set, a "~fuzziness"
     * suffix is appended to every plain TX:term clause (phrase clauses are left
     * alone) before re-parsing.
     *
     * @param reader used only to rewrite the initially parsed query
     * @param query_parser the parser to use (with or without stop words)
     * @param query_string the raw query, e.g. "+(TX:(snail farm)) +SI:[...]"
     * @param fuzziness fuzzy-match factor as a string, or null for exact matching
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	// prefix = the balanced "+(...)" search-term group; suffix = everything after it
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset into mutable_query_string
	    // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    // (matching the two characters of TEXTFIELD then ':')
	    int s = 0; // State
	    while(o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness is String concatenation (fuzziness is a String),
			// so this inserts e.g. "~0.8" just before the terminator.
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
460
    // If you're dealing with a BooleanQuery, they need to be recursively rewritten
    // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery)
    // e.g. season* farm
    // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
    // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
    // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
    // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
    // Call this method from runQuery() after it calls parseQuery().
    // Now searches like these will work
    // season* farm
    // season* farm*
    // and not just searches like the following which already used to work:
    // season*
    // snail farm
    // Idea for the solution of recursively processing a BooleanQuery came from inspecting source code to BooleanQuery.java
    // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
    // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
    // subcomponents.
    /**
     * Recursively rewrites the query so wildcard/prefix (MultiTermQuery) clauses
     * expand into concrete Terms, falling back through progressively cheaper
     * rewrite methods when BooleanQuery.TooManyClauses is thrown. See the long
     * comment above and http://trac.greenstone.org/ticket/845 for the history.
     */
    protected Query recursivelyRewriteQuery(Query orig_query, IndexReader reader, LuceneQueryResult lucene_query_result) throws java.io.IOException
    {
	//Query query = orig_query.rewrite(reader);
	Query query = orig_query;

	// First recurse into BooleanQuery sub-clauses so nested wildcard clauses
	// get their rewrite method set before the top-level rewrite below.
	if(orig_query instanceof BooleanQuery) {
	    BooleanQuery booleanQuery = (BooleanQuery)orig_query;
	    List<BooleanClause> clauses = booleanQuery.clauses();
	    for (BooleanClause clause : clauses) {
		Query subQuery = clause.getQuery();
		subQuery = recursivelyRewriteQuery(subQuery, reader, lucene_query_result);
		clause.setQuery(subQuery);
	    }
	}

	// GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
	// This change in lucene core library for GS3 (present since after version 2.4.1) had the
	// side-effect that searching on "econom*" didn't display what terms it was searching for,
	// whereas it had done so in GS2.

	// The details of this problem and its current solution are explained in the ticket
	// http://trac.greenstone.org/ticket/845

	// We need to change the settings for the rewriteMethod in order to get searches on wildcards
	// to produce search terms again when the query gets rewritten.

	// We try, in order:
	// 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
	// it will expand wildcard searches to its terms when searching at both section AND doc level.
	// If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
	// 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
	// If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
	// 3. Then try the default apache rewriteMethod with its optimum defaults of
	// termCountCutoff=350 and docCountPercent cutoff=0.1%
	// See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

	//System.err.println("@@@@ query class name: " + orig_query.getClass());
	//System.err.println("@@@@ QUERY: " + orig_query);

	if(orig_query instanceof MultiTermQuery) {
	    MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
	    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
	    // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
	}

	try {
	    query = orig_query.rewrite(reader);
	}
	catch(BooleanQuery.TooManyClauses clauseException) {
	    // Example test case: try searching the lucene demo collection for "a*"
	    // and you'll hit this exception

	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

	    if(query instanceof MultiTermQuery) {

		// CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
		// This will at least expand the query to its terms when searching with wildcards at section-level
		// (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

		MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
		customRewriteMethod.setDocCountPercent(100.0);
		customRewriteMethod.setTermCountCutoff(350); // same as default

		MultiTermQuery multiTermQuery = (MultiTermQuery)query;
		multiTermQuery.setRewriteMethod(customRewriteMethod);
		try {
		    query = query.rewrite(reader);
		}
		catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {

		    // do what the code originally did: use the default rewriteMethod which
		    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

		    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
		    query = query.rewrite(reader);
		}
	    }
	}

	// BooleanQuery.java recurses rewriting any query until it is identical before and after rewrite,
	// see reference to "recursively rewrite" in
	// https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
	if(orig_query == query) {
	    return query;
	} else {
	    return recursivelyRewriteQuery(query, reader, lucene_query_result);
	}
    }
568
569 protected Filter parseFilterString(String filter_string)
570 {
571 Filter result = null;
572 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
573 Matcher matcher = pattern.matcher(filter_string);
574 if (matcher.matches()) {
575 String field_name = matcher.group(1);
576 boolean include_lower = matcher.group(2).equals("[");
577 BytesRef lower_term = new BytesRef(matcher.group(3));
578 BytesRef upper_term = new BytesRef(matcher.group(4));
579 boolean include_upper = matcher.group(5).equals("]");
580 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
581 }
582 else {
583 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
584 }
585 return result;
586 }
587
588
589 /** command line program and auxiliary methods */
590
591 // Fairly self-explanatory I should hope
592 static protected boolean query_result_caching_enabled = false;
593
594 /**
595 * This main() method is used by GS2 to do searches.
596 * In GS2, lucene_query.pl calles this main() method in the LuceneWrapper4.jar. This main method instantiates both
597 * a GS2LuceneQuery and an IndexReader object. It then passes the reader to the GS2LuceneQuery object by calling
598 * the GS2LuceneQuery.initialise(reader) method. This main() method then finally performs the search with the provided query.
599 * GS3 doesn't use this main() method. Instead a GS2LuceneSearch object (of gsdl3.jar) instantiates both
600 * the GS2LuceneQuery and IndexReader objects and proceeds the same way.
601 */
602 static public void main (String args[])
603 {
604 if (args.length == 0) {
605 System.out.println("Usage: org.greenstone.LuceneWrapper4.GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
606 return;
607 }
608
609 try {
610 String index_directory = args[0];
611
612 GS2LuceneQuery queryer = new GS2LuceneQuery();
613 queryer.setIndexDir(index_directory);
614
615 // Prepare the index cache directory, if query result caching is enabled
616 if (query_result_caching_enabled) {
617 // Make the index cache directory if it doesn't already exist
618 File index_cache_directory = new File(index_directory, "cache");
619 if (!index_cache_directory.exists()) {
620 index_cache_directory.mkdir();
621 }
622
623 // Disable caching if the index cache directory isn't available
624 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
625 query_result_caching_enabled = false;
626 }
627 }
628
629 String query_string = null;
630
631 // Parse the command-line arguments
632 for (int i = 1; i < args.length; i++) {
633 if (args[i].equals("-sort")) {
634 i++;
635 queryer.setSortField(args[i]);
636 }
637 else if (args[i].equals("-reverse_sort")) {
638 queryer.setReverseSort(true);
639 }
640 else if (args[i].equals("-filter")) {
641 i++;
642 queryer.setFilterString(args[i]);
643 }
644 else if (args[i].equals("-dco")) {
645 i++;
646 queryer.setDefaultConjunctionOperator(args[i]);
647 }
648 else if (args[i].equals("-fuzziness")) {
649 i++;
650 queryer.setFuzziness(args[i]);
651 }
652 else if (args[i].equals("-startresults")) {
653 i++;
654 if (args[i].matches("\\d+")) {
655 queryer.setStartResults(Integer.parseInt(args[i]));
656 }
657 }
658 else if (args[i].equals("-endresults")) {
659 i++;
660 if (args[i].matches("\\d+")) {
661 queryer.setEndResults(Integer.parseInt(args[i]));
662 }
663 }
664 else {
665 query_string = args[i];
666 }
667 }
668
669 Directory full_indexdir_dir = FSDirectory.open(new File(index_directory));
670 IndexReader reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory.
671 // Now readOnly=true by default, and therefore also for searcher created in initialise() call below.
672 if (!queryer.initialise(reader)) {
673 if(reader != null) reader.close(); // close reader object IF reader was instantiated
674 queryer.cleanUp(); // will close searcher object if non-null
675 return;
676 }
677
678 // The query string has been specified as a command-line argument
679 if (query_string != null) {
680 runQueryCaching(index_directory, queryer, query_string);
681 }
682
683 // Read queries from STDIN
684 else {
685 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
686 while (true) {
687 // Read the query from STDIN
688 query_string = in.readLine();
689 if (query_string == null || query_string.length() == -1) {
690 break;
691 }
692
693 runQueryCaching(index_directory, queryer, query_string);
694
695 }
696 }
697 if(reader != null) reader.close();
698 queryer.cleanUp();
699 }
700 catch (IOException exception) {
701 exception.printStackTrace();
702 }
703 }
704
705 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
706 throws IOException
707 {
708 StringBuffer query_results_xml = new StringBuffer();
709
710 // Check if this query result has been cached from a previous search (if it's enabled)
711 File query_result_cache_file = null;
712 if (query_result_caching_enabled) {
713 // Generate the cache file name from the query options
714 String query_result_cache_file_name = query_string + "-";
715 String fuzziness = queryer.getFuzziness();
716 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
717 String filter_string = queryer.getFilterString();
718 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
719 String sort_string = queryer.getSortField();
720 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
721 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
722 query_result_cache_file_name += reverse_sort_string + "-";
723 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
724 query_result_cache_file_name += default_conjunction_operator + "-";
725 int start_results = queryer.getStartResults();
726 int end_results = queryer.getEndResults();
727 query_result_cache_file_name += start_results + "-" + end_results;
728 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
729
730 // If the query result cache file exists, just return its contents and we're done
731 File index_cache_directory = new File(index_directory, "cache");
732 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
733 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
734 FileInputStream fis = new FileInputStream(query_result_cache_file);
735 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
736 BufferedReader buffered_reader = new BufferedReader(isr);
737 String line = "";
738 while ((line = buffered_reader.readLine()) != null) {
739 query_results_xml.append(line + "\n");
740 }
741 String query_results_xml_string = query_results_xml.toString();
742 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
743
744 utf8out.print(query_results_xml_string);
745 utf8out.flush();
746
747 return;
748 }
749 }
750
751 // not cached
752 query_results_xml.append("<ResultSet cached=\"false\">\n");
753 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
754 Filter filter = queryer.getFilter();
755 if (filter != null) {
756 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
757 }
758
759 LuceneQueryResult query_result = queryer.runQuery(query_string);
760 if (query_result == null) {
761 System.err.println("Couldn't run the query");
762 return;
763 }
764
765 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
766 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
767 } else {
768 query_results_xml.append(query_result.getXMLString());
769 }
770 query_results_xml.append("</ResultSet>\n");
771
772 utf8out.print(query_results_xml);
773 utf8out.flush();
774
775 // Cache this query result, if desired
776 if (query_result_caching_enabled) {
777 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
778 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
779 // files, it will just affect the speed of subsequent requests.
780 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
781 // can get very long in some collections)
782 try
783 {
784 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
785 query_result_cache_file_writer.write(query_results_xml.toString());
786 query_result_cache_file_writer.close();
787 }
788 catch (Exception exception)
789 {
790 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
791 }
792 }
793 }
794
795 protected static String fileSafe(String text)
796 {
797 StringBuffer file_safe_text = new StringBuffer();
798 for (int i = 0; i < text.length(); i++) {
799 char character = text.charAt(i);
800 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
801 file_safe_text.append(character);
802 }
803 else {
804 file_safe_text.append('%');
805 file_safe_text.append((int) character);
806 }
807 }
808 return file_safe_text.toString();
809 }
810
811
812}
813
814
Note: See TracBrowser for help on using the repository browser.