source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 29167

Last change on this file since 29167 was 29167, checked in by ak19, 10 years ago

Fix for a NullPointerException thrown when a query term matches no term docs (e.g. the search term "bla"). Also adds some comments on how to get all the matching docs in Lucene 4.7.2.

File size: 27.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
43import org.apache.lucene.search.Filter;
44import org.apache.lucene.search.IndexSearcher;
45import org.apache.lucene.search.MultiTermQuery;
46import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
47import org.apache.lucene.search.Query;
48import org.apache.lucene.search.TermRangeFilter;
49import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
50import org.apache.lucene.search.ScoreDoc;
51import org.apache.lucene.search.Sort;
52import org.apache.lucene.search.SortField;
53import org.apache.lucene.search.TopFieldDocs;
54
55import org.apache.lucene.index.DocsEnum;
56import org.apache.lucene.index.MultiFields;
57
58import org.apache.lucene.store.Directory;
59import org.apache.lucene.store.FSDirectory;
60
61import org.apache.lucene.util.Bits;
62import org.apache.lucene.util.BytesRef;
63import org.apache.lucene.util.Version;
64
65public class GS2LuceneQuery extends SharedSoleneQuery
66{
    // Sort-mode names accepted by setSortField()
    public static String SORT_RANK = "rank";
    public static String SORT_NATURAL = "natural";

    // Full filesystem path of the Lucene index directory (set via setIndexDir)
    protected String full_indexdir="";

    // Sort configuration: sort_type pairs with sort_field (inherited from the
    // superclass); sorter is rebuilt from both in initialise()
    protected SortField.Type sort_type = SortField.Type.SCORE;
    protected boolean reverse_sort = false;
    protected Sort sorter=new Sort();
    // Parsed form of the filter string, built by setFilterString(); may be null
    protected Filter filter = null;

    // Two parsers: one using the analyzer's built-in stop words, one with none
    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    // Opened against full_indexdir in initialise(), closed in cleanUp()
    protected IndexSearcher searcher = null;
    protected IndexReader reader = null;
82 public GS2LuceneQuery() {
83 super();
84
85 // Create one query parser with the standard set of stop words, and one with none
86
87 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
88 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
89 }
90
91
92 public boolean initialise() {
93
94 if (!super.initialise()) {
95 return false;
96 }
97
98
99 if (full_indexdir==null || full_indexdir.length()==-1){
100 utf8out.println("Index directory is not indicated ");
101 utf8out.flush();
102 return false;
103 }
104
105 try {
106 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
107
108 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
109 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
110
111 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
    /** Records the full path of the Lucene index directory to be opened by initialise(). */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }
124
125 public void setSortField(String sort_field) {
126 if (sort_field.equals(SORT_RANK)) {
127 this.sort_field = null;
128 this.sort_type = SortField.Type.SCORE;
129 } else if (sort_field.equals(SORT_NATURAL)) {
130 this.sort_field = null;
131 this.sort_type = SortField.Type.DOC;
132 } else {
133 this.sort_field = sort_field;
134 this.sort_type = SortField.Type.STRING; // for now. numeric??
135 }
136 }
    /** Sets whether results should be returned in reverse sort order (used when building the sorter in initialise()). */
    public void setReverseSort(boolean reverse) {
	this.reverse_sort = reverse;
    }
    /** @return true if results are to be sorted in reverse order */
    public boolean getReverseSort() {
	return this.reverse_sort;
    }
143
    /**
     * Stores the raw filter string in the superclass and parses it into a
     * Lucene Filter for use by runQuery().
     */
    public void setFilterString(String filter_string) {
	super.setFilterString(filter_string);
	this.filter = parseFilterString(filter_string);
    }

    /** @return the parsed Filter, or null if none was set or parsing failed */
    public Filter getFilter() {
	return this.filter;
    }
152
153
154 public LuceneQueryResult runQuery(String query_string) {
155
156 if (query_string == null || query_string.equals("")) {
157 utf8out.println("The query word is not indicated ");
158 utf8out.flush();
159 return null;
160 }
161
162 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
163 lucene_query_result.clear();
164
165 try {
166 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
167 query_including_stop_words = query_including_stop_words.rewrite(reader);
168
169 // System.err.println("********* query_string " + query_string + "****");
170
171 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
172
173 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
174 // This change in lucene core library for GS3 (present since after version 2.4.1) had the
175 // side-effect that searching on "econom*" didn't display what terms it was searching for,
176 // whereas it had done so in GS2.
177
178 // The details of this problem and its current solution are explained in the ticket
179 // http://trac.greenstone.org/ticket/845
180
181 // We need to change the settings for the rewriteMethod in order to get searches on wildcards
182 // to produce search terms again when the query gets rewritten.
183
184 // We try, in order:
185 // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
186 // it will expand wildcard searches to its terms when searching at both section AND doc level.
187 // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
188 // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
189 // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
190 // 3. Then try the default apache rewriteMethod with its optimum defaults of
191 // termCountCutoff=350 and docCountPercent cutoff=0.1%
192 // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
193
194 if(query instanceof MultiTermQuery) {
195 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
196 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
197 // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
198 }
199
200 try {
201 query = query.rewrite(reader);
202 }
203 catch(BooleanQuery.TooManyClauses clauseException) {
204 // Example test case: try searching the lucene demo collection for "a*"
205 // and you'll hit this exception
206
207 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
208
209 if(query instanceof MultiTermQuery) {
210
211 // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
212 // This will at least expand the query to its terms when searching with wildcards at section-level
213 // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
214
215 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
216 customRewriteMethod.setDocCountPercent(100.0);
217 customRewriteMethod.setTermCountCutoff(350); // same as default
218
219 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
220 multiTermQuery.setRewriteMethod(customRewriteMethod);
221 try {
222 query = query.rewrite(reader);
223 }
224 catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
225
226 // do what the code originally did: use the default rewriteMethod which
227 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
228
229 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
230 query = query.rewrite(reader);
231 }
232 }
233 }
234
235 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
236 // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
237 // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
238 // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
239 // http://lucene.apache.org/core/4_7_2/MIGRATE.html
240
241 // Get the list of expanded query terms and their frequencies
242 // num docs matching, and total frequency
243 HashSet terms = new HashSet();
244 query.extractTerms(terms);
245
246 HashMap doc_term_freq_map = new HashMap();
247
248 Iterator iter = terms.iterator();
249
250 Bits liveDocs = null;
251 if(reader.hasDeletions()) {
252 System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
253 liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
254 }
255
256 while (iter.hasNext()) {
257
258 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
259
260 Term term = (Term) iter.next();
261 BytesRef term_bytes = term.bytes();
262 DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?
263
264 // Get the term frequency over all the documents
265 //TermDocs term_docs = reader.termDocs(term);
266 int term_freq = 0;
267 int match_docs = 0;
268
269 if(term_docs != null) {
270 int docID = -1;
271 while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
272 if (term_docs.freq() != 0)
273 {
274 term_freq += term_docs.freq();
275 match_docs++;
276
277 // Calculate the document-level term frequency as well
278 Integer lucene_doc_num_obj = new Integer(term_docs.docID());
279 int doc_term_freq = 0;
280 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
281 {
282 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
283 }
284 doc_term_freq += term_docs.freq();
285
286 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
287 }
288 }
289 } else {
290 System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
291 }
292
293 // Create a term
294 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
295 }
296
297 // Get the list of stop words removed from the query
298 HashSet terms_including_stop_words = new HashSet();
299 query_including_stop_words.extractTerms(terms_including_stop_words);
300 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
301 while (terms_including_stop_words_iter.hasNext()) {
302 Term term = (Term) terms_including_stop_words_iter.next();
303 if (!terms.contains(term)) {
304 lucene_query_result.addStopWord(term.text());
305 }
306 }
307
308 // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
309 // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
310 // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html
311
312 // 1. Figure out how many results there will be.
313 //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
314 //searcher.search(query, filter, collector);
315 //int hitCount = collector.count;
316
317 // Actually do the query
318 // Simple case for getting all the matching documents
319 if (end_results == Integer.MAX_VALUE) {
320 // Perform the query (filter and sorter may be null)
321 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
322 // Is there a slight difference in the definition between
323 // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
324 // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
325 // Seems to be okay.
326 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
327
328 lucene_query_result.setTotalDocs(hits.totalHits);
329
330 // Output the matching documents
331 lucene_query_result.setStartResults(start_results);
332 lucene_query_result.setEndResults(hits.totalHits);
333
334 for (int i = start_results; i <= hits.totalHits; i++) {
335 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
336 Document doc = reader.document(lucene_doc_num);
337 int doc_term_freq = 0;
338 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
339 if (doc_term_freq_object != null)
340 {
341 doc_term_freq = doc_term_freq_object.intValue();
342 }
343 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
344 }
345 }
346
347 // Slightly more complicated case for returning a subset of the matching documents
348 else {
349 // Perform the query (filter may be null)
350 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
351 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
352 lucene_query_result.setTotalDocs(hits.totalHits);
353
354 lucene_query_result.setStartResults(start_results);
355 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
356
357 // Output the matching documents
358 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
359 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
360 Document doc = reader.document(lucene_doc_num);
361 int doc_term_freq = 0;
362 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
363 if (doc_term_freq_object != null)
364 {
365 doc_term_freq = doc_term_freq_object.intValue();
366 }
367 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
368 }
369 }
370 }
371
372 catch (ParseException parse_exception) {
373 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
374 }
375 catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
376 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
377 }
378 catch (IOException exception) {
379 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
380 exception.printStackTrace();
381 }
382 catch (Exception exception) {
383 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
384 exception.printStackTrace();
385 }
386 return lucene_query_result;
387 }
388
389 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
390 super.setDefaultConjunctionOperator(default_conjunction_operator);
391
392 if (default_conjunction_operator.equals("AND")) {
393 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
394 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
395 } else { // default is OR
396 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
397 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
398 }
399 }
400
401
402 public void cleanUp() {
403 super.cleanUp();
404 try {
405 if(reader != null) {
406 reader.close();
407 // Closes files associated with this index. Also saves any new deletions to disk.
408 // No other methods should be called after this has been called.
409 }
410 } catch (IOException exception) {
411 exception.printStackTrace();
412 }
413 }
414
415
    /**
     * Parses query_string into a Lucene Query, optionally appending the fuzzy
     * operator (~fuzziness) to each plain TX:&lt;term&gt; in the rewritten query.
     *
     * The string is first split into a prefix — the initial parenthesised
     * group, assumed to hold the search terms — and a suffix (any trailing
     * filter terms). Only the prefix is rewritten for fuzziness; phrase
     * queries (TX:"...") are skipped since fuzzy phrases are not supported.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser the parser (with or without stop words) to use
     * @param query_string the raw query text
     * @param fuzziness    fuzzy factor to append to each term, or null for none
     * @return the parsed (and possibly fuzzified) query
     * @throws java.io.IOException if rewriting against the reader fails
     * @throws org.apache.lucene.queryparser.classic.ParseException if the
     *         query text cannot be parsed
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the prefix so wildcard terms are expanded before
	// the fuzzy operator is attached to them
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    // NOTE(review): the T/X checks below compare against the first two
	    // characters of TEXTFIELD — assumes TEXTFIELD is two characters
	    // followed by ':' in the rewritten query (e.g. "TX:"); confirm if
	    // TEXTFIELD ever changes.
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    int s = 0; // State
	    while (o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness concatenates to e.g. "~0.7"; skip past it
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    // Not fuzzy: parse the whole original string in one go
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
517
518 protected Filter parseFilterString(String filter_string)
519 {
520 Filter result = null;
521 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
522 Matcher matcher = pattern.matcher(filter_string);
523 if (matcher.matches()) {
524 String field_name = matcher.group(1);
525 boolean include_lower = matcher.group(2).equals("[");
526 BytesRef lower_term = new BytesRef(matcher.group(3));
527 BytesRef upper_term = new BytesRef(matcher.group(4));
528 boolean include_upper = matcher.group(5).equals("]");
529 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
530 }
531 else {
532 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
533 }
534 return result;
535 }
536
537
    /** command line program and auxiliary methods */

    // When true, query result XML is cached on disk under <index>/cache and
    // reused for identical queries; main() switches it off again if the cache
    // directory cannot be created. Off by default.
    static protected boolean query_result_caching_enabled = false;
542
543
544 static public void main (String args[])
545 {
546 if (args.length == 0) {
547 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
548 return;
549 }
550
551 try {
552 String index_directory = args[0];
553
554 GS2LuceneQuery queryer = new GS2LuceneQuery();
555 queryer.setIndexDir(index_directory);
556
557 // Prepare the index cache directory, if query result caching is enabled
558 if (query_result_caching_enabled) {
559 // Make the index cache directory if it doesn't already exist
560 File index_cache_directory = new File(index_directory, "cache");
561 if (!index_cache_directory.exists()) {
562 index_cache_directory.mkdir();
563 }
564
565 // Disable caching if the index cache directory isn't available
566 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
567 query_result_caching_enabled = false;
568 }
569 }
570
571 String query_string = null;
572
573 // Parse the command-line arguments
574 for (int i = 1; i < args.length; i++) {
575 if (args[i].equals("-sort")) {
576 i++;
577 queryer.setSortField(args[i]);
578 }
579 else if (args[i].equals("-reverse_sort")) {
580 queryer.setReverseSort(true);
581 }
582 else if (args[i].equals("-filter")) {
583 i++;
584 queryer.setFilterString(args[i]);
585 }
586 else if (args[i].equals("-dco")) {
587 i++;
588 queryer.setDefaultConjunctionOperator(args[i]);
589 }
590 else if (args[i].equals("-fuzziness")) {
591 i++;
592 queryer.setFuzziness(args[i]);
593 }
594 else if (args[i].equals("-startresults")) {
595 i++;
596 if (args[i].matches("\\d+")) {
597 queryer.setStartResults(Integer.parseInt(args[i]));
598 }
599 }
600 else if (args[i].equals("-endresults")) {
601 i++;
602 if (args[i].matches("\\d+")) {
603 queryer.setEndResults(Integer.parseInt(args[i]));
604 }
605 }
606 else {
607 query_string = args[i];
608 }
609 }
610
611 if (!queryer.initialise()) {
612 return;
613 }
614
615 // The query string has been specified as a command-line argument
616 if (query_string != null) {
617 runQueryCaching(index_directory, queryer, query_string);
618 }
619
620 // Read queries from STDIN
621 else {
622 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
623 while (true) {
624 // Read the query from STDIN
625 query_string = in.readLine();
626 if (query_string == null || query_string.length() == -1) {
627 break;
628 }
629
630 runQueryCaching(index_directory, queryer, query_string);
631
632 }
633 }
634 queryer.cleanUp();
635 }
636 catch (IOException exception) {
637 exception.printStackTrace();
638 }
639 }
640
641 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
642 throws IOException
643 {
644 StringBuffer query_results_xml = new StringBuffer();
645
646 // Check if this query result has been cached from a previous search (if it's enabled)
647 File query_result_cache_file = null;
648 if (query_result_caching_enabled) {
649 // Generate the cache file name from the query options
650 String query_result_cache_file_name = query_string + "-";
651 String fuzziness = queryer.getFuzziness();
652 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
653 String filter_string = queryer.getFilterString();
654 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
655 String sort_string = queryer.getSortField();
656 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
657 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
658 query_result_cache_file_name += reverse_sort_string + "-";
659 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
660 query_result_cache_file_name += default_conjunction_operator + "-";
661 int start_results = queryer.getStartResults();
662 int end_results = queryer.getEndResults();
663 query_result_cache_file_name += start_results + "-" + end_results;
664 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
665
666 // If the query result cache file exists, just return its contents and we're done
667 File index_cache_directory = new File(index_directory, "cache");
668 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
669 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
670 FileInputStream fis = new FileInputStream(query_result_cache_file);
671 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
672 BufferedReader buffered_reader = new BufferedReader(isr);
673 String line = "";
674 while ((line = buffered_reader.readLine()) != null) {
675 query_results_xml.append(line + "\n");
676 }
677 String query_results_xml_string = query_results_xml.toString();
678 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
679
680 utf8out.print(query_results_xml_string);
681 utf8out.flush();
682
683 return;
684 }
685 }
686
687 // not cached
688 query_results_xml.append("<ResultSet cached=\"false\">\n");
689 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
690 Filter filter = queryer.getFilter();
691 if (filter != null) {
692 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
693 }
694
695 LuceneQueryResult query_result = queryer.runQuery(query_string);
696 if (query_result == null) {
697 System.err.println("Couldn't run the query");
698 return;
699 }
700
701 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
702 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
703 } else {
704 query_results_xml.append(query_result.getXMLString());
705 }
706 query_results_xml.append("</ResultSet>\n");
707
708 utf8out.print(query_results_xml);
709 utf8out.flush();
710
711 // Cache this query result, if desired
712 if (query_result_caching_enabled) {
713 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
714 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
715 // files, it will just affect the speed of subsequent requests.
716 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
717 // can get very long in some collections)
718 try
719 {
720 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
721 query_result_cache_file_writer.write(query_results_xml.toString());
722 query_result_cache_file_writer.close();
723 }
724 catch (Exception exception)
725 {
726 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
727 }
728 }
729 }
730
731 protected static String fileSafe(String text)
732 {
733 StringBuffer file_safe_text = new StringBuffer();
734 for (int i = 0; i < text.length(); i++) {
735 char character = text.charAt(i);
736 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
737 file_safe_text.append(character);
738 }
739 else {
740 file_safe_text.append('%');
741 file_safe_text.append((int) character);
742 }
743 }
744 return file_safe_text.toString();
745 }
746
747
748}
749
750
Note: See TracBrowser for help on using the repository browser.