source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 32506

Last change on this file since 32506 was 32506, checked in by ak19, 6 years ago

Bugfix for a bug that Kathy discovered in code I had committed: with the upgrade to Lucene 4, wildcard searches would work, e.g. season*. But boolean searches that combined wildcard search terms with regular terms, or with other wildcard terms, didn't work. If a query was a BooleanQuery, it would not expand any wildcard search terms it contained, despite BooleanQuery otherwise recursively doing a rewrite as per its source code. The solution was to recursively rewrite the query ourselves, to additionally handle MultiTermQuery boolean clauses within a BooleanQuery besides the existing code that handles standalone MultiTermQuerys (which can be of type WildcardQuery or PrefixQuery, though they get wrapped in ConstantScoreQuery objects). I've moved the existing code that deals with MultiTermQuerys into the new recursive function, which now performs the further (recursive) step of rewriting BooleanQuerys to preserve and expand the MultiTermQuery objects they contain.

File size: 29.8 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanClause;
43import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
44import org.apache.lucene.search.ConstantScoreQuery;
45import org.apache.lucene.search.Filter;
46import org.apache.lucene.search.IndexSearcher;
47import org.apache.lucene.search.MultiTermQuery;
48import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
49import org.apache.lucene.search.Query;
50import org.apache.lucene.search.TermRangeFilter;
51import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
52import org.apache.lucene.search.ScoreDoc;
53import org.apache.lucene.search.Sort;
54import org.apache.lucene.search.SortField;
55import org.apache.lucene.search.TopFieldDocs;
56
57import org.apache.lucene.index.DocsEnum;
58import org.apache.lucene.index.MultiFields;
59
60import org.apache.lucene.store.Directory;
61import org.apache.lucene.store.FSDirectory;
62
63import org.apache.lucene.util.Bits;
64import org.apache.lucene.util.BytesRef;
65import org.apache.lucene.util.Version;
66
67public class GS2LuceneQuery extends SharedSoleneQuery
68{
69 public static String SORT_RANK = "rank";
70 public static String SORT_NATURAL = "natural";
71
72 protected String full_indexdir="";
73
74 protected SortField.Type sort_type = SortField.Type.SCORE;
75 protected boolean reverse_sort = false;
76 protected Sort sorter=new Sort();
77 protected Filter filter = null;
78
79 protected QueryParser query_parser = null;
80 protected QueryParser query_parser_no_stop_words = null;
81 protected IndexSearcher searcher = null;
82 protected IndexReader reader = null;
83
84 public GS2LuceneQuery() {
85 super();
86
87 // Create one query parser with the standard set of stop words, and one with none
88
89 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
90 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
91 }
92
93
94 public boolean initialise() {
95
96 if (!super.initialise()) {
97 return false;
98 }
99
100
101 if (full_indexdir==null || full_indexdir.length()==-1){
102 utf8out.println("Index directory is not indicated ");
103 utf8out.flush();
104 return false;
105 }
106
107 try {
108 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
109
110 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
111 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
112
113 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
114 }
115 catch (IOException exception) {
116 exception.printStackTrace();
117 return false;
118 }
119 return true;
120
121 }
122
    /** Sets the filesystem path of the Lucene index directory to search. */
    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }
126
    /**
     * Selects the result ordering: SORT_RANK sorts by relevance score,
     * SORT_NATURAL by Lucene document order, and any other value sorts on that
     * field's string value. The setting takes effect when initialise() next
     * builds the sorter. Note: a null argument throws NullPointerException.
     */
    public void setSortField(String sort_field) {
        if (sort_field.equals(SORT_RANK)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.SCORE;
        } else if (sort_field.equals(SORT_NATURAL)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.DOC;
        } else {
            this.sort_field = sort_field;
            this.sort_type = SortField.Type.STRING; // for now. numeric??
        }
    }
    /** Enables or disables reversal of the chosen sort order (applied when initialise() builds the sorter). */
    public void setReverseSort(boolean reverse) {
        this.reverse_sort = reverse;
    }
    /** @return whether the sort order is currently set to be reversed. */
    public boolean getReverseSort() {
        return this.reverse_sort;
    }
145
    /**
     * Records the raw filter string (via the superclass) and parses it into a
     * Lucene range Filter; if parsing fails, the filter is left null.
     */
    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }

    /** @return the parsed range Filter, or null if none was set or parsing failed. */
    public Filter getFilter() {
        return this.filter;
    }
154
155
    /**
     * Runs the given query string against the open index and returns a
     * LuceneQueryResult holding the matching documents, the expanded query
     * terms with their document/total frequencies, and the stop words the
     * parser removed. Returns null if the query string is null/empty; other
     * failures are reported via the error code set on the returned result.
     */
    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result=new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            // Also parse with the no-stop-words parser, purely so we can later
            // report which stop words the normal parser dropped (see below).
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            System.err.println("********* query_string " + query_string + "****");

            // Parse the real query (handles fuzziness), then recursively rewrite
            // it so wildcard/prefix (MultiTermQuery) clauses - including ones
            // nested inside BooleanQuerys - are expanded into concrete terms.
            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = recursiveRewriteQuery(query, reader);
            System.err.println("@@@@ final query class name: " + query.getClass());

            // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
            // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
            // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
            // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
            // http://lucene.apache.org/core/4_7_2/MIGRATE.html

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            // Per-document term-frequency totals, keyed by Lucene doc number.
            HashMap doc_term_freq_map = new HashMap();

            Iterator iter = terms.iterator();

            Bits liveDocs = null;
            if(reader.hasDeletions()) {
                System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
                liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
            }

            // For each expanded term, walk its postings to accumulate the
            // total frequency, the number of matching docs, and per-doc counts.
            while (iter.hasNext()) {

                // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

                Term term = (Term) iter.next();
                System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
                BytesRef term_bytes = term.bytes();
                DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

                // Get the term frequency over all the documents
                //TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;

                if(term_docs != null) {
                    int docID = -1;
                    while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
                        if (term_docs.freq() != 0)
                        {
                            term_freq += term_docs.freq();
                            match_docs++;

                            // Calculate the document-level term frequency as well
                            Integer lucene_doc_num_obj = new Integer(term_docs.docID());
                            int doc_term_freq = 0;
                            if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
                            {
                                doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
                            }
                            doc_term_freq += term_docs.freq();

                            doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
                        }
                    }
                } else {
                    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query: any term
            // present in the no-stop-words parse but absent from the real parse.
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
            // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
            // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

            // 1. Figure out how many results there will be.
            //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
            //searcher.search(query, filter, collector);
            //int hitCount = collector.count;

            // Actually do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // Is there a slight difference in the definition between
                // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
                // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
                // Seems to be okay.
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

                lucene_query_result.setTotalDocs(hits.totalHits);

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.totalHits); // ??

                for (int i = start_results; i < hits.totalHits; i++) {
                    int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }
331
332 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
333 super.setDefaultConjunctionOperator(default_conjunction_operator);
334
335 if (default_conjunction_operator.equals("AND")) {
336 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
337 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
338 } else { // default is OR
339 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
340 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
341 }
342 }
343
344
345 public void cleanUp() {
346 super.cleanUp();
347 try {
348 if(reader != null) {
349 reader.close();
350 // Closes files associated with this index. Also saves any new deletions to disk.
351 // No other methods should be called after this has been called.
352 }
353 } catch (IOException exception) {
354 exception.printStackTrace();
355 }
356 }
357
358
    /**
     * Parses the raw query string into a Lucene Query. The first
     * parenthesised "+(...)" group is taken to hold the search terms proper;
     * any remainder (filter terms) is carried through as an unparsed suffix.
     * When fuzziness is non-null, a "~fuzziness" operator is appended to each
     * plain TX:&lt;term&gt; in the rewritten query (phrase searches are left
     * alone, as fuzzy matching does not apply to them) and the amended string
     * is re-parsed.
     *
     * @param fuzziness fuzzy-match factor as a string, or null for exact matching
     * @return the parsed (and possibly fuzzified) query
     * @throws IOException    if the rewrite against the reader fails
     * @throws ParseException if the query string cannot be parsed
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        // up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            // NOTE(review): the states track successive characters of
            // TEXTFIELD then ':'; this assumes TEXTFIELD is exactly two
            // characters long (e.g. "TX") - confirm.
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while(o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term... add the
                    // fuzzy search indicator
                    // Nor outside the scope of parentheses
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }
460
461 // If you're dealing with a BooleanQuery, they need to be recursively rewritten
462 // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery)
463 // e.g. season* farm
464 // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
465 // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
466 // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
467 // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
468 // Call this method from runQuery() after it calls parseQuery().
469 // Now searches like these will work
470 // season* farm
471 // season* farm*
472 // and not just searches like the following which already used to work:
473 // season*
474 // snail farm
475 // Idea for this method came from inspecting source code to BooleanQuery
476 // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
477 // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
478 // subcomponents.
    /**
     * Recursively rewrites a query so that MultiTermQuery components
     * (WildcardQuery/PrefixQuery, e.g. "season*"), including those nested as
     * clauses inside a BooleanQuery, get expanded into their concrete terms.
     * When a rewrite overflows with TooManyClauses, progressively more
     * conservative rewrite methods are tried (see the numbered comments
     * below). Call this from runQuery() AFTER parseQuery(), never inside it
     * (see the explanation in the comment block above this method).
     *
     * @return the fully rewritten query; recursion stops once a rewrite pass
     *         returns the query unchanged.
     */
    protected Query recursiveRewriteQuery(Query orig_query, IndexReader reader) throws java.io.IOException
    {
        //Query query = orig_query.rewrite(reader);
        Query query = orig_query;

        // First recurse into any BooleanQuery, rewriting each clause in place
        // so embedded wildcard queries are also expanded.
        if(orig_query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery)orig_query;
            List<BooleanClause> clauses = booleanQuery.clauses();
            for (BooleanClause clause : clauses) {
                Query subQuery = clause.getQuery();
                subQuery = recursiveRewriteQuery(subQuery, reader);
                clause.setQuery(subQuery);
            }
        }

        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
        // This change in lucene core library for GS3 (present since after version 2.4.1) had the
        // side-effect that searching on "econom*" didn't display what terms it was searching for,
        // whereas it had done so in GS2.

        // The details of this problem and its current solution are explained in the ticket
        // http://trac.greenstone.org/ticket/845

        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
        // to produce search terms again when the query gets rewritten.

        // We try, in order:
        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
        // it will expand wildcard searches to its terms when searching at both section AND doc level.
        // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
        // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
        // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
        // 3. Then try the default apache rewriteMethod with its optimum defaults of
        // termCountCutoff=350 and docCountPercent cutoff=0.1%
        // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

        System.err.println("@@@@ query class name: " + orig_query.getClass());
        System.err.println("@@@@ QUERY: " + orig_query);

        if(orig_query instanceof MultiTermQuery) {
            MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
            // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
        }

        try {
            query = orig_query.rewrite(reader);
        }
        catch(BooleanQuery.TooManyClauses clauseException) {
            // Example test case: try searching the lucene demo collection for "a*"
            // and you'll hit this exception

            //lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

            if(query instanceof MultiTermQuery) {

                // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
                // This will at least expand the query to its terms when searching with wildcards at section-level
                // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

                MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
                customRewriteMethod.setDocCountPercent(100.0);
                customRewriteMethod.setTermCountCutoff(350); // same as default

                MultiTermQuery multiTermQuery = (MultiTermQuery)query;
                multiTermQuery.setRewriteMethod(customRewriteMethod);
                try {
                    query = query.rewrite(reader);
                }
                catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {

                    // do what the code originally did: use the default rewriteMethod which
                    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

                    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
                    query = query.rewrite(reader);
                }
            }
        }

        // If the rewrite changed anything, keep rewriting until it stabilises.
        if(orig_query == query) {
            return query;
        } else {
            return recursiveRewriteQuery(query, reader);
        }
    }
565
566 protected Filter parseFilterString(String filter_string)
567 {
568 Filter result = null;
569 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
570 Matcher matcher = pattern.matcher(filter_string);
571 if (matcher.matches()) {
572 String field_name = matcher.group(1);
573 boolean include_lower = matcher.group(2).equals("[");
574 BytesRef lower_term = new BytesRef(matcher.group(3));
575 BytesRef upper_term = new BytesRef(matcher.group(4));
576 boolean include_upper = matcher.group(5).equals("]");
577 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
578 }
579 else {
580 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
581 }
582 return result;
583 }
584
585
586 /** command line program and auxiliary methods */
587
588 // Fairly self-explanatory I should hope
589 static protected boolean query_result_caching_enabled = false;
590
591
592 static public void main (String args[])
593 {
594 if (args.length == 0) {
595 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
596 return;
597 }
598
599 try {
600 String index_directory = args[0];
601
602 GS2LuceneQuery queryer = new GS2LuceneQuery();
603 queryer.setIndexDir(index_directory);
604
605 // Prepare the index cache directory, if query result caching is enabled
606 if (query_result_caching_enabled) {
607 // Make the index cache directory if it doesn't already exist
608 File index_cache_directory = new File(index_directory, "cache");
609 if (!index_cache_directory.exists()) {
610 index_cache_directory.mkdir();
611 }
612
613 // Disable caching if the index cache directory isn't available
614 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
615 query_result_caching_enabled = false;
616 }
617 }
618
619 String query_string = null;
620
621 // Parse the command-line arguments
622 for (int i = 1; i < args.length; i++) {
623 if (args[i].equals("-sort")) {
624 i++;
625 queryer.setSortField(args[i]);
626 }
627 else if (args[i].equals("-reverse_sort")) {
628 queryer.setReverseSort(true);
629 }
630 else if (args[i].equals("-filter")) {
631 i++;
632 queryer.setFilterString(args[i]);
633 }
634 else if (args[i].equals("-dco")) {
635 i++;
636 queryer.setDefaultConjunctionOperator(args[i]);
637 }
638 else if (args[i].equals("-fuzziness")) {
639 i++;
640 queryer.setFuzziness(args[i]);
641 }
642 else if (args[i].equals("-startresults")) {
643 i++;
644 if (args[i].matches("\\d+")) {
645 queryer.setStartResults(Integer.parseInt(args[i]));
646 }
647 }
648 else if (args[i].equals("-endresults")) {
649 i++;
650 if (args[i].matches("\\d+")) {
651 queryer.setEndResults(Integer.parseInt(args[i]));
652 }
653 }
654 else {
655 query_string = args[i];
656 }
657 }
658
659 if (!queryer.initialise()) {
660 return;
661 }
662
663 // The query string has been specified as a command-line argument
664 if (query_string != null) {
665 runQueryCaching(index_directory, queryer, query_string);
666 }
667
668 // Read queries from STDIN
669 else {
670 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
671 while (true) {
672 // Read the query from STDIN
673 query_string = in.readLine();
674 if (query_string == null || query_string.length() == -1) {
675 break;
676 }
677
678 runQueryCaching(index_directory, queryer, query_string);
679
680 }
681 }
682 queryer.cleanUp();
683 }
684 catch (IOException exception) {
685 exception.printStackTrace();
686 }
687 }
688
689 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
690 throws IOException
691 {
692 StringBuffer query_results_xml = new StringBuffer();
693
694 // Check if this query result has been cached from a previous search (if it's enabled)
695 File query_result_cache_file = null;
696 if (query_result_caching_enabled) {
697 // Generate the cache file name from the query options
698 String query_result_cache_file_name = query_string + "-";
699 String fuzziness = queryer.getFuzziness();
700 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
701 String filter_string = queryer.getFilterString();
702 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
703 String sort_string = queryer.getSortField();
704 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
705 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
706 query_result_cache_file_name += reverse_sort_string + "-";
707 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
708 query_result_cache_file_name += default_conjunction_operator + "-";
709 int start_results = queryer.getStartResults();
710 int end_results = queryer.getEndResults();
711 query_result_cache_file_name += start_results + "-" + end_results;
712 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
713
714 // If the query result cache file exists, just return its contents and we're done
715 File index_cache_directory = new File(index_directory, "cache");
716 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
717 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
718 FileInputStream fis = new FileInputStream(query_result_cache_file);
719 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
720 BufferedReader buffered_reader = new BufferedReader(isr);
721 String line = "";
722 while ((line = buffered_reader.readLine()) != null) {
723 query_results_xml.append(line + "\n");
724 }
725 String query_results_xml_string = query_results_xml.toString();
726 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
727
728 utf8out.print(query_results_xml_string);
729 utf8out.flush();
730
731 return;
732 }
733 }
734
735 // not cached
736 query_results_xml.append("<ResultSet cached=\"false\">\n");
737 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
738 Filter filter = queryer.getFilter();
739 if (filter != null) {
740 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
741 }
742
743 LuceneQueryResult query_result = queryer.runQuery(query_string);
744 if (query_result == null) {
745 System.err.println("Couldn't run the query");
746 return;
747 }
748
749 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
750 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
751 } else {
752 query_results_xml.append(query_result.getXMLString());
753 }
754 query_results_xml.append("</ResultSet>\n");
755
756 utf8out.print(query_results_xml);
757 utf8out.flush();
758
759 // Cache this query result, if desired
760 if (query_result_caching_enabled) {
761 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
762 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
763 // files, it will just affect the speed of subsequent requests.
764 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
765 // can get very long in some collections)
766 try
767 {
768 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
769 query_result_cache_file_writer.write(query_results_xml.toString());
770 query_result_cache_file_writer.close();
771 }
772 catch (Exception exception)
773 {
774 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
775 }
776 }
777 }
778
779 protected static String fileSafe(String text)
780 {
781 StringBuffer file_safe_text = new StringBuffer();
782 for (int i = 0; i < text.length(); i++) {
783 char character = text.charAt(i);
784 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
785 file_safe_text.append(character);
786 }
787 else {
788 file_safe_text.append('%');
789 file_safe_text.append((int) character);
790 }
791 }
792 return file_safe_text.toString();
793 }
794
795
796}
797
798
Note: See TracBrowser for help on using the repository browser.