source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 32609

Last change on this file since 32609 was 32609, checked in by ak19, 5 years ago

Preliminary stage before tackling a different bug. This commit is a bugfix to the index folder file locking problem that occurs on Windows when coll deactivate doesn't close all file handles to the coll index folder after doing some lucene searches. Inspecting the code revealed the possibility of another different bug, for which Kathy devised a test to confirm its existence. After testing, found the bug is real: multiple queries configure the same query object (and its internal reader object) but the last configuration is always used to run a search. For example, one user wants to search a lucene collection at doc level and a second user wants to search the same collection at section level. The 2nd user's configuration wins if they configure between the first person's query object being configured and its query being run. So the first person now ends up seeing search results that are at section level.

File size: 30.4 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanClause;
43import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
44import org.apache.lucene.search.ConstantScoreQuery;
45import org.apache.lucene.search.Filter;
46import org.apache.lucene.search.IndexSearcher;
47import org.apache.lucene.search.MultiTermQuery;
48import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
49import org.apache.lucene.search.Query;
50import org.apache.lucene.search.TermRangeFilter;
51import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
52import org.apache.lucene.search.ScoreDoc;
53import org.apache.lucene.search.Sort;
54import org.apache.lucene.search.SortField;
55import org.apache.lucene.search.TopFieldDocs;
56
57import org.apache.lucene.index.DocsEnum;
58import org.apache.lucene.index.MultiFields;
59
60import org.apache.lucene.store.Directory;
61import org.apache.lucene.store.FSDirectory;
62
63import org.apache.lucene.util.Bits;
64import org.apache.lucene.util.BytesRef;
65import org.apache.lucene.util.Version;
66
67public class GS2LuceneQuery extends SharedSoleneQuery
68{
    // Symbolic sort names accepted by setSortField().
    public static String SORT_RANK = "rank";
    public static String SORT_NATURAL = "natural";

    // Absolute path of the Lucene index directory; set via setIndexDir().
    protected String full_indexdir="";

    // Sort configuration; combined into 'sorter' by initialise().
    protected SortField.Type sort_type = SortField.Type.SCORE;
    protected boolean reverse_sort = false;
    protected Sort sorter=new Sort();
    // Optional range filter parsed from the filter string (null if none/unparseable).
    protected Filter filter = null;

    // Two parsers: one keeps the analyzer's built-in stop words, the other has none,
    // so runQuery() can report which stop words were removed from a query.
    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    // Reader/searcher over the index; (re)opened in initialise(), closed in cleanUp().
    protected IndexSearcher searcher = null;
    protected IndexReader reader = null;
83
    /**
     * Constructs the queryer and its two query parsers: one using the
     * analyzer's default stop-word set and one with an empty stop-word set
     * (used later to detect which stop words a query lost).
     */
    public GS2LuceneQuery() {
        super();

        // Create one query parser with the standard set of stop words, and one with none

        query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
        query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
    }
92
93
94 public boolean initialise() {
95
96 if (!super.initialise()) {
97 return false;
98 }
99
100
101 if (full_indexdir==null || full_indexdir.length()==-1){
102 utf8out.println("Index directory is not indicated ");
103 utf8out.flush();
104 return false;
105 }
106
107 try {
108
109 if(reader != null) {
110 reader.close();
111 searcher = null;
112 }
113
114 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
115
116 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
117 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
118
119 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
120 }
121 catch (IOException exception) {
122 exception.printStackTrace();
123 return false;
124 }
125 return true;
126
127 }
128
    /** Records the path of the index directory that initialise() will open. */
    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }
132
133 public void setSortField(String sort_field) {
134 if (sort_field.equals(SORT_RANK)) {
135 this.sort_field = null;
136 this.sort_type = SortField.Type.SCORE;
137 } else if (sort_field.equals(SORT_NATURAL)) {
138 this.sort_field = null;
139 this.sort_type = SortField.Type.DOC;
140 } else {
141 this.sort_field = sort_field;
142 this.sort_type = SortField.Type.STRING; // for now. numeric??
143 }
144 }
    /** Sets whether the sort order built in initialise() is reversed. */
    public void setReverseSort(boolean reverse) {
        this.reverse_sort = reverse;
    }
    /** Returns the current reverse-sort flag. */
    public boolean getReverseSort() {
        return this.reverse_sort;
    }
151
    /**
     * Stores the raw filter string (via the superclass) and parses it into
     * a Lucene Filter applied to every subsequent runQuery() call.
     */
    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }

    /** Returns the parsed filter, or null if none was set or parsing failed. */
    public Filter getFilter() {
        return this.filter;
    }
160
161
    /**
     * Runs the given raw query string against the open index and packages
     * into a LuceneQueryResult: the matching documents (with scores and
     * per-document term frequencies), the expanded query terms with their
     * collection-wide frequencies, and any stop words removed from the query.
     * Returns null when the query string is missing; other failures are
     * reported through the result's error code rather than thrown.
     * Requires initialise() to have been called successfully first.
     */
    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result=new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            // Parse with the no-stop-words parser too; terms present here but
            // absent from the filtered query are the removed stop words.
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            // System.err.println("********* query_string " + query_string + "****");

            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = recursivelyRewriteQuery(query, reader, lucene_query_result);
            // System.err.println("@@@@ final query class name: " + query.getClass());

            // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
            // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
            // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
            // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
            // http://lucene.apache.org/core/4_7_2/MIGRATE.html

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            // Maps Lucene doc number -> summed frequency of all query terms in that doc
            HashMap doc_term_freq_map = new HashMap();

            Iterator iter = terms.iterator();

            Bits liveDocs = null;
            if(reader.hasDeletions()) {
                System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
                liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
            }

            while (iter.hasNext()) {

                // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

                Term term = (Term) iter.next();
                // System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
                BytesRef term_bytes = term.bytes();
                DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

                // Get the term frequency over all the documents
                //TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;

                if(term_docs != null) {
                    int docID = -1;
                    while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
                        if (term_docs.freq() != 0)
                        {
                            term_freq += term_docs.freq();
                            match_docs++;

                            // Calculate the document-level term frequency as well
                            Integer lucene_doc_num_obj = new Integer(term_docs.docID());
                            int doc_term_freq = 0;
                            if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
                            {
                                doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
                            }
                            doc_term_freq += term_docs.freq();

                            doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
                        }
                    }
                } else {
                    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
            // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
            // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

            // 1. Figure out how many results there will be.
            //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
            //searcher.search(query, filter, collector);
            //int hitCount = collector.count;

            // Actually do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // Is there a slight difference in the definition between
                // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
                // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
                // Seems to be okay.
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

                lucene_query_result.setTotalDocs(hits.totalHits);

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.totalHits); // ??

                // NOTE(review): the loop starts at start_results while indexing
                // scoreDocs[i] directly; if start_results is 1-based (the "// i-1"
                // remark below suggests doubt) the first hit is skipped — confirm
                // the convention used by callers before changing.
                for (int i = start_results; i < hits.totalHits; i++) {
                    int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }
337
338 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
339 super.setDefaultConjunctionOperator(default_conjunction_operator);
340
341 if (default_conjunction_operator.equals("AND")) {
342 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
343 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
344 } else { // default is OR
345 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
346 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
347 }
348 }
349
350
351 public void cleanUp() {
352 super.cleanUp();
353 try {
354 if(reader != null) {
355 reader.close();
356 // Closes files associated with this index. Also saves any new deletions to disk.
357 // No other methods should be called after this has been called.
358 }
359
360 } catch (IOException exception) {
361 exception.printStackTrace();
362 }
363 }
364
365
    /**
     * Parses a Greenstone query string into a Lucene Query. The string is
     * split into a leading parenthesised search-term prefix and a trailing
     * filter suffix; when a fuzziness value is supplied, each TX:&lt;term&gt;
     * in the rewritten prefix gets a "~fuzziness" suffix appended (phrases
     * are left alone) before the whole string is reparsed.
     *
     * @param reader       used to rewrite the parsed prefix before fuzzifying
     * @param query_parser the parser to use (with or without stop words)
     * @param query_string the raw query, e.g. "+(TX:(term)) +docOID:..."
     * @param fuzziness    fuzzy-match value to append, or null for exact search
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while(o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                // State transitions match the field prefix (e.g. "TX")
                // character by character, then ':', then scan the term.
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term... add the
                    // fuzzy search indicator
                    // Nor outside the scope of parentheses
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }
467
    // If you're dealing with a BooleanQuery, they need to be recursively rewritten
    // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery)
    // e.g. season* farm
    // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
    // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
    // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
    // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
    // Call this method from runQuery() after it calls parseQuery().
    // Now searches like these will work
    //     season* farm
    //     season* farm*
    // and not just searches like the following which already used to work:
    //     season*
    //     snail farm
    // Idea for the solution of recursively processing a BooleanQuery came from inspecting source code to BooleanQuery.java
    // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
    // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
    // subcomponents.
    /**
     * Recursively rewrites a query so wildcard/prefix (MultiTermQuery)
     * components expand into concrete Terms that extractTerms() can report.
     * TooManyClauses errors are recorded in lucene_query_result and a series
     * of progressively more conservative rewrite methods is attempted.
     * Recurses until the rewrite reaches a fixed point.
     */
    protected Query recursivelyRewriteQuery(Query orig_query, IndexReader reader, LuceneQueryResult lucene_query_result) throws java.io.IOException
    {
        //Query query = orig_query.rewrite(reader);
        Query query = orig_query;

        // First rewrite each clause of a BooleanQuery in place, so nested
        // wildcard sub-queries get the same treatment as top-level ones.
        if(orig_query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery)orig_query;
            List<BooleanClause> clauses = booleanQuery.clauses();
            for (BooleanClause clause : clauses) {
                Query subQuery = clause.getQuery();
                subQuery = recursivelyRewriteQuery(subQuery, reader, lucene_query_result);
                clause.setQuery(subQuery);
            }
        }

        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
        // This change in lucene core library for GS3 (present since after version 2.4.1) had the
        // side-effect that searching on "econom*" didn't display what terms it was searching for,
        // whereas it had done so in GS2.

        // The details of this problem and its current solution are explained in the ticket
        // http://trac.greenstone.org/ticket/845

        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
        // to produce search terms again when the query gets rewritten.

        // We try, in order:
        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
        //    it will expand wildcard searches to its terms when searching at both section AND doc level.
        //    If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
        // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
        //    If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
        // 3. Then try the default apache rewriteMethod with its optimum defaults of
        //    termCountCutoff=350 and docCountPercent cutoff=0.1%
        //    See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

        //System.err.println("@@@@ query class name: " + orig_query.getClass());
        //System.err.println("@@@@ QUERY: " + orig_query);

        if(orig_query instanceof MultiTermQuery) {
            MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
            // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
        }

        try {
            query = orig_query.rewrite(reader);
        }
        catch(BooleanQuery.TooManyClauses clauseException) {
            // Example test case: try searching the lucene demo collection for "a*"
            // and you'll hit this exception

            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

            if(query instanceof MultiTermQuery) {

                // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
                // This will at least expand the query to its terms when searching with wildcards at section-level
                // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

                MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
                customRewriteMethod.setDocCountPercent(100.0);
                customRewriteMethod.setTermCountCutoff(350); // same as default

                MultiTermQuery multiTermQuery = (MultiTermQuery)query;
                multiTermQuery.setRewriteMethod(customRewriteMethod);
                try {
                    query = query.rewrite(reader);
                }
                catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {

                    // do what the code originally did: use the default rewriteMethod which
                    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

                    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
                    query = query.rewrite(reader);
                }
            }
        }

        // BooleanQuery.java recurses rewriting any query until it is identical before and after rewrite,
        // see reference to "recursively rewrite" in
        // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
        if(orig_query == query) {
            return query;
        } else {
            return recursivelyRewriteQuery(query, reader, lucene_query_result);
        }
    }
575
576 protected Filter parseFilterString(String filter_string)
577 {
578 Filter result = null;
579 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
580 Matcher matcher = pattern.matcher(filter_string);
581 if (matcher.matches()) {
582 String field_name = matcher.group(1);
583 boolean include_lower = matcher.group(2).equals("[");
584 BytesRef lower_term = new BytesRef(matcher.group(3));
585 BytesRef upper_term = new BytesRef(matcher.group(4));
586 boolean include_upper = matcher.group(5).equals("]");
587 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
588 }
589 else {
590 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
591 }
592 return result;
593 }
594
595
    /** command line program and auxiliary methods */

    // When true, query results are cached as XML files in <index dir>/cache
    // and served from there on repeat queries. Disabled by default.
    static protected boolean query_result_caching_enabled = false;
600
601
602 static public void main (String args[])
603 {
604 if (args.length == 0) {
605 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
606 return;
607 }
608
609 try {
610 String index_directory = args[0];
611
612 GS2LuceneQuery queryer = new GS2LuceneQuery();
613 queryer.setIndexDir(index_directory);
614
615 // Prepare the index cache directory, if query result caching is enabled
616 if (query_result_caching_enabled) {
617 // Make the index cache directory if it doesn't already exist
618 File index_cache_directory = new File(index_directory, "cache");
619 if (!index_cache_directory.exists()) {
620 index_cache_directory.mkdir();
621 }
622
623 // Disable caching if the index cache directory isn't available
624 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
625 query_result_caching_enabled = false;
626 }
627 }
628
629 String query_string = null;
630
631 // Parse the command-line arguments
632 for (int i = 1; i < args.length; i++) {
633 if (args[i].equals("-sort")) {
634 i++;
635 queryer.setSortField(args[i]);
636 }
637 else if (args[i].equals("-reverse_sort")) {
638 queryer.setReverseSort(true);
639 }
640 else if (args[i].equals("-filter")) {
641 i++;
642 queryer.setFilterString(args[i]);
643 }
644 else if (args[i].equals("-dco")) {
645 i++;
646 queryer.setDefaultConjunctionOperator(args[i]);
647 }
648 else if (args[i].equals("-fuzziness")) {
649 i++;
650 queryer.setFuzziness(args[i]);
651 }
652 else if (args[i].equals("-startresults")) {
653 i++;
654 if (args[i].matches("\\d+")) {
655 queryer.setStartResults(Integer.parseInt(args[i]));
656 }
657 }
658 else if (args[i].equals("-endresults")) {
659 i++;
660 if (args[i].matches("\\d+")) {
661 queryer.setEndResults(Integer.parseInt(args[i]));
662 }
663 }
664 else {
665 query_string = args[i];
666 }
667 }
668
669 if (!queryer.initialise()) {
670 queryer.cleanUp(); // will close reader object IF reader was instantiated
671 return;
672 }
673
674 // The query string has been specified as a command-line argument
675 if (query_string != null) {
676 runQueryCaching(index_directory, queryer, query_string);
677 }
678
679 // Read queries from STDIN
680 else {
681 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
682 while (true) {
683 // Read the query from STDIN
684 query_string = in.readLine();
685 if (query_string == null || query_string.length() == -1) {
686 break;
687 }
688
689 runQueryCaching(index_directory, queryer, query_string);
690
691 }
692 }
693 queryer.cleanUp();
694 }
695 catch (IOException exception) {
696 exception.printStackTrace();
697 }
698 }
699
700 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
701 throws IOException
702 {
703 StringBuffer query_results_xml = new StringBuffer();
704
705 // Check if this query result has been cached from a previous search (if it's enabled)
706 File query_result_cache_file = null;
707 if (query_result_caching_enabled) {
708 // Generate the cache file name from the query options
709 String query_result_cache_file_name = query_string + "-";
710 String fuzziness = queryer.getFuzziness();
711 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
712 String filter_string = queryer.getFilterString();
713 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
714 String sort_string = queryer.getSortField();
715 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
716 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
717 query_result_cache_file_name += reverse_sort_string + "-";
718 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
719 query_result_cache_file_name += default_conjunction_operator + "-";
720 int start_results = queryer.getStartResults();
721 int end_results = queryer.getEndResults();
722 query_result_cache_file_name += start_results + "-" + end_results;
723 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
724
725 // If the query result cache file exists, just return its contents and we're done
726 File index_cache_directory = new File(index_directory, "cache");
727 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
728 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
729 FileInputStream fis = new FileInputStream(query_result_cache_file);
730 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
731 BufferedReader buffered_reader = new BufferedReader(isr);
732 String line = "";
733 while ((line = buffered_reader.readLine()) != null) {
734 query_results_xml.append(line + "\n");
735 }
736 String query_results_xml_string = query_results_xml.toString();
737 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
738
739 utf8out.print(query_results_xml_string);
740 utf8out.flush();
741
742 return;
743 }
744 }
745
746 // not cached
747 query_results_xml.append("<ResultSet cached=\"false\">\n");
748 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
749 Filter filter = queryer.getFilter();
750 if (filter != null) {
751 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
752 }
753
754 LuceneQueryResult query_result = queryer.runQuery(query_string);
755 if (query_result == null) {
756 System.err.println("Couldn't run the query");
757 return;
758 }
759
760 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
761 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
762 } else {
763 query_results_xml.append(query_result.getXMLString());
764 }
765 query_results_xml.append("</ResultSet>\n");
766
767 utf8out.print(query_results_xml);
768 utf8out.flush();
769
770 // Cache this query result, if desired
771 if (query_result_caching_enabled) {
772 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
773 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
774 // files, it will just affect the speed of subsequent requests.
775 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
776 // can get very long in some collections)
777 try
778 {
779 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
780 query_result_cache_file_writer.write(query_results_xml.toString());
781 query_result_cache_file_writer.close();
782 }
783 catch (Exception exception)
784 {
785 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
786 }
787 }
788 }
789
790 protected static String fileSafe(String text)
791 {
792 StringBuffer file_safe_text = new StringBuffer();
793 for (int i = 0; i < text.length(); i++) {
794 char character = text.charAt(i);
795 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
796 file_safe_text.append(character);
797 }
798 else {
799 file_safe_text.append('%');
800 file_safe_text.append((int) character);
801 }
802 }
803 return file_safe_text.toString();
804 }
805
806
807}
808
809
Note: See TracBrowser for help on using the repository browser.