source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.

File size: 26.9 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
43import org.apache.lucene.search.Filter;
44import org.apache.lucene.search.IndexSearcher;
45import org.apache.lucene.search.MultiTermQuery;
46import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
47import org.apache.lucene.search.Query;
48import org.apache.lucene.search.TermRangeFilter;
49import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
50import org.apache.lucene.search.ScoreDoc;
51import org.apache.lucene.search.Sort;
52import org.apache.lucene.search.SortField;
53import org.apache.lucene.search.TopFieldDocs;
54
55import org.apache.lucene.index.DocsEnum;
56import org.apache.lucene.index.MultiFields;
57
58import org.apache.lucene.store.Directory;
59import org.apache.lucene.store.FSDirectory;
60
61import org.apache.lucene.util.Bits;
62import org.apache.lucene.util.BytesRef;
63import org.apache.lucene.util.Version;
64
65public class GS2LuceneQuery extends SharedSoleneQuery
66{
67 public static String SORT_RANK = "rank";
68 public static String SORT_NATURAL = "natural";
69
70 protected String full_indexdir="";
71
72 protected SortField.Type sort_type = SortField.Type.SCORE;
73 protected boolean reverse_sort = false;
74 protected Sort sorter=new Sort();
75 protected Filter filter = null;
76
77 protected QueryParser query_parser = null;
78 protected QueryParser query_parser_no_stop_words = null;
79 protected IndexSearcher searcher = null;
80 protected IndexReader reader = null;
81
82 public GS2LuceneQuery() {
83 super();
84
85 // Create one query parser with the standard set of stop words, and one with none
86
87 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
88 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
89 }
90
91
92 public boolean initialise() {
93
94 if (!super.initialise()) {
95 return false;
96 }
97
98
99 if (full_indexdir==null || full_indexdir.length()==-1){
100 utf8out.println("Index directory is not indicated ");
101 utf8out.flush();
102 return false;
103 }
104
105 try {
106 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
107
108 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
109 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
110
111 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public void setIndexDir(String full_indexdir) {
122 this.full_indexdir = full_indexdir;
123 }
124
125 public void setSortField(String sort_field) {
126 if (sort_field.equals(SORT_RANK)) {
127 this.sort_field = null;
128 this.sort_type = SortField.Type.SCORE;
129 } else if (sort_field.equals(SORT_NATURAL)) {
130 this.sort_field = null;
131 this.sort_type = SortField.Type.DOC;
132 } else {
133 this.sort_field = sort_field;
134 this.sort_type = SortField.Type.STRING; // for now. numeric??
135 }
136 }
137 public void setReverseSort(boolean reverse) {
138 this.reverse_sort = reverse;
139 }
140 public boolean getReverseSort() {
141 return this.reverse_sort;
142 }
143
144 public void setFilterString(String filter_string) {
145 super.setFilterString(filter_string);
146 this.filter = parseFilterString(filter_string);
147 }
148
149 public Filter getFilter() {
150 return this.filter;
151 }
152
153
154 public LuceneQueryResult runQuery(String query_string) {
155
156 if (query_string == null || query_string.equals("")) {
157 utf8out.println("The query word is not indicated ");
158 utf8out.flush();
159 return null;
160 }
161
162 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
163 lucene_query_result.clear();
164
165 try {
166 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
167 query_including_stop_words = query_including_stop_words.rewrite(reader);
168
169 // System.err.println("********* query_string " + query_string + "****");
170
171 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
172
173 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
174 // This change in lucene core library for GS3 (present since after version 2.4.1) had the
175 // side-effect that searching on "econom*" didn't display what terms it was searching for,
176 // whereas it had done so in GS2.
177
178 // The details of this problem and its current solution are explained in the ticket
179 // http://trac.greenstone.org/ticket/845
180
181 // We need to change the settings for the rewriteMethod in order to get searches on wildcards
182 // to produce search terms again when the query gets rewritten.
183
184 // We try, in order:
185 // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
186 // it will expand wildcard searches to its terms when searching at both section AND doc level.
187 // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
188 // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
189 // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
190 // 3. Then try the default apache rewriteMethod with its optimum defaults of
191 // termCountCutoff=350 and docCountPercent cutoff=0.1%
192 // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
193
194 if(query instanceof MultiTermQuery) {
195 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
196 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
197 // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
198 }
199
200 try {
201 query = query.rewrite(reader);
202 }
203 catch(BooleanQuery.TooManyClauses clauseException) {
204 // Example test case: try searching the lucene demo collection for "a*"
205 // and you'll hit this exception
206
207 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
208
209 if(query instanceof MultiTermQuery) {
210
211 // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
212 // This will at least expand the query to its terms when searching with wildcards at section-level
213 // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
214
215 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
216 customRewriteMethod.setDocCountPercent(100.0);
217 customRewriteMethod.setTermCountCutoff(350); // same as default
218
219 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
220 multiTermQuery.setRewriteMethod(customRewriteMethod);
221 try {
222 query = query.rewrite(reader);
223 }
224 catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
225
226 // do what the code originally did: use the default rewriteMethod which
227 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
228
229 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
230 query = query.rewrite(reader);
231 }
232 }
233 }
234
235 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
236 // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
237 // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
238 // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
239 // http://lucene.apache.org/core/4_7_2/MIGRATE.html
240
241 // Get the list of expanded query terms and their frequencies
242 // num docs matching, and total frequency
243 HashSet terms = new HashSet();
244 query.extractTerms(terms);
245
246 HashMap doc_term_freq_map = new HashMap();
247
248 Iterator iter = terms.iterator();
249
250 Bits liveDocs = null;
251 if(reader.hasDeletions()) {
252 System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
253 liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
254 }
255
256 while (iter.hasNext()) {
257
258 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
259
260 Term term = (Term) iter.next();
261 BytesRef term_bytes = term.bytes();
262 DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?
263
264 // Get the term frequency over all the documents
265 //TermDocs term_docs = reader.termDocs(term);
266 int term_freq = 0;
267 int match_docs = 0;
268
269 int docID = -1;
270 while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
271 if (term_docs.freq() != 0)
272 {
273 term_freq += term_docs.freq();
274 match_docs++;
275
276 // Calculate the document-level term frequency as well
277 Integer lucene_doc_num_obj = new Integer(term_docs.docID());
278 int doc_term_freq = 0;
279 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
280 {
281 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
282 }
283 doc_term_freq += term_docs.freq();
284
285 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
286 }
287 }
288
289 // Create a term
290 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
291 }
292
293 // Get the list of stop words removed from the query
294 HashSet terms_including_stop_words = new HashSet();
295 query_including_stop_words.extractTerms(terms_including_stop_words);
296 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
297 while (terms_including_stop_words_iter.hasNext()) {
298 Term term = (Term) terms_including_stop_words_iter.next();
299 if (!terms.contains(term)) {
300 lucene_query_result.addStopWord(term.text());
301 }
302 }
303
304 // do the query
305 // Simple case for getting all the matching documents
306 if (end_results == Integer.MAX_VALUE) {
307 // Perform the query (filter and sorter may be null)
308 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
309 // Is there a slight difference in the definition between
310 // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
311 // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
312 // Seems to be okay.
313 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
314
315 lucene_query_result.setTotalDocs(hits.totalHits);
316
317 // Output the matching documents
318 lucene_query_result.setStartResults(start_results);
319 lucene_query_result.setEndResults(hits.totalHits);
320
321 for (int i = start_results; i <= hits.totalHits; i++) {
322 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
323 Document doc = reader.document(lucene_doc_num);
324 int doc_term_freq = 0;
325 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
326 if (doc_term_freq_object != null)
327 {
328 doc_term_freq = doc_term_freq_object.intValue();
329 }
330 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
331 }
332 }
333
334 // Slightly more complicated case for returning a subset of the matching documents
335 else {
336 // Perform the query (filter may be null)
337 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
338 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
339 lucene_query_result.setTotalDocs(hits.totalHits);
340
341 lucene_query_result.setStartResults(start_results);
342 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
343
344 // Output the matching documents
345 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
346 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
347 Document doc = reader.document(lucene_doc_num);
348 int doc_term_freq = 0;
349 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
350 if (doc_term_freq_object != null)
351 {
352 doc_term_freq = doc_term_freq_object.intValue();
353 }
354 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
355 }
356 }
357 }
358
359 catch (ParseException parse_exception) {
360 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
361 }
362 catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
363 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
364 }
365 catch (IOException exception) {
366 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
367 exception.printStackTrace();
368 }
369 catch (Exception exception) {
370 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
371 exception.printStackTrace();
372 }
373 return lucene_query_result;
374 }
375
376 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
377 super.setDefaultConjunctionOperator(default_conjunction_operator);
378
379 if (default_conjunction_operator.equals("AND")) {
380 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
381 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
382 } else { // default is OR
383 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
384 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
385 }
386 }
387
388
389 public void cleanUp() {
390 super.cleanUp();
391 try {
392 if(reader != null) {
393 reader.close();
394 // Closes files associated with this index. Also saves any new deletions to disk.
395 // No other methods should be called after this has been called.
396 }
397 } catch (IOException exception) {
398 exception.printStackTrace();
399 }
400 }
401
402
403 protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
404 throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
405 {
406 // Split query string into the search terms and the filter terms
407 // * The first +(...) term contains the search terms so count
408 // up '(' and stop when we finish matching ')'
409 int offset = 0;
410 int paren_count = 0;
411 boolean seen_paren = false;
412 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
413 if (query_string.charAt(offset) == '(') {
414 paren_count++;
415 seen_paren = true;
416 }
417 if (query_string.charAt(offset) == ')') {
418 paren_count--;
419 }
420 offset++;
421 }
422 String query_prefix = query_string.substring(0, offset);
423 String query_suffix = query_string.substring(offset);
424
425 ///ystem.err.println("Prefix: " + query_prefix);
426 ///ystem.err.println("Suffix: " + query_suffix);
427
428 Query query = query_parser.parse(query_prefix);
429 query = query.rewrite(reader);
430
431 // If this is a fuzzy search, then we need to add the fuzzy
432 // flag to each of the query terms
433 if (fuzziness != null && query.toString().length() > 0) {
434
435 // Revert the query to a string
436 System.err.println("Rewritten query: " + query.toString());
437 // Search through the string for TX:<term> query terms
438 // and append the ~ operator. Note that this search will
439 // not change phrase searches (TX:"<term> <term>") as
440 // fuzzy searching is not possible for these entries.
441 // Yahoo! Time for a state machine!
442 StringBuffer mutable_query_string = new StringBuffer(query.toString());
443 int o = 0; // Offset
444 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
445 int s = 0; // State
446 while(o < mutable_query_string.length()) {
447 char c = mutable_query_string.charAt(o);
448 if (s == 0 && c == TEXTFIELD.charAt(0)) {
449 ///ystem.err.println("Found T!");
450 s = 1;
451 }
452 else if (s == 1) {
453 if (c == TEXTFIELD.charAt(1)) {
454 ///ystem.err.println("Found X!");
455 s = 2;
456 }
457 else {
458 s = 0; // Reset
459 }
460 }
461 else if (s == 2) {
462 if (c == ':') {
463 ///ystem.err.println("Found TX:!");
464 s = 3;
465 }
466 else {
467 s = 0; // Reset
468 }
469 }
470 else if (s == 3) {
471 // Don't process phrases
472 if (c == '"') {
473 ///ystem.err.println("Stupid phrase...");
474 s = 0; // Reset
475 }
476 // Found the end of the term... add the
477 // fuzzy search indicator
478 // Nor outside the scope of parentheses
479 else if (Character.isWhitespace(c) || c == ')') {
480 ///ystem.err.println("Yahoo! Found fuzzy term.");
481 mutable_query_string.insert(o, '~' + fuzziness);
482 o++;
483 s = 0; // Reset
484 }
485 }
486 o++;
487 }
488 // If we were in the state of looking for the end of a
489 // term - then we just found it!
490 if (s == 3) {
491
492 mutable_query_string.append('~' + fuzziness);
493 }
494 // Reparse the query
495 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
496 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
497 }
498 else {
499 query = query_parser.parse(query_prefix + query_suffix);
500 }
501
502 return query;
503 }
504
505 protected Filter parseFilterString(String filter_string)
506 {
507 Filter result = null;
508 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
509 Matcher matcher = pattern.matcher(filter_string);
510 if (matcher.matches()) {
511 String field_name = matcher.group(1);
512 boolean include_lower = matcher.group(2).equals("[");
513 BytesRef lower_term = new BytesRef(matcher.group(3));
514 BytesRef upper_term = new BytesRef(matcher.group(4));
515 boolean include_upper = matcher.group(5).equals("]");
516 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
517 }
518 else {
519 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
520 }
521 return result;
522 }
523
524
525 /** command line program and auxiliary methods */
526
527 // Fairly self-explanatory I should hope
528 static protected boolean query_result_caching_enabled = false;
529
530
531 static public void main (String args[])
532 {
533 if (args.length == 0) {
534 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
535 return;
536 }
537
538 try {
539 String index_directory = args[0];
540
541 GS2LuceneQuery queryer = new GS2LuceneQuery();
542 queryer.setIndexDir(index_directory);
543
544 // Prepare the index cache directory, if query result caching is enabled
545 if (query_result_caching_enabled) {
546 // Make the index cache directory if it doesn't already exist
547 File index_cache_directory = new File(index_directory, "cache");
548 if (!index_cache_directory.exists()) {
549 index_cache_directory.mkdir();
550 }
551
552 // Disable caching if the index cache directory isn't available
553 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
554 query_result_caching_enabled = false;
555 }
556 }
557
558 String query_string = null;
559
560 // Parse the command-line arguments
561 for (int i = 1; i < args.length; i++) {
562 if (args[i].equals("-sort")) {
563 i++;
564 queryer.setSortField(args[i]);
565 }
566 else if (args[i].equals("-reverse_sort")) {
567 queryer.setReverseSort(true);
568 }
569 else if (args[i].equals("-filter")) {
570 i++;
571 queryer.setFilterString(args[i]);
572 }
573 else if (args[i].equals("-dco")) {
574 i++;
575 queryer.setDefaultConjunctionOperator(args[i]);
576 }
577 else if (args[i].equals("-fuzziness")) {
578 i++;
579 queryer.setFuzziness(args[i]);
580 }
581 else if (args[i].equals("-startresults")) {
582 i++;
583 if (args[i].matches("\\d+")) {
584 queryer.setStartResults(Integer.parseInt(args[i]));
585 }
586 }
587 else if (args[i].equals("-endresults")) {
588 i++;
589 if (args[i].matches("\\d+")) {
590 queryer.setEndResults(Integer.parseInt(args[i]));
591 }
592 }
593 else {
594 query_string = args[i];
595 }
596 }
597
598 if (!queryer.initialise()) {
599 return;
600 }
601
602 // The query string has been specified as a command-line argument
603 if (query_string != null) {
604 runQueryCaching(index_directory, queryer, query_string);
605 }
606
607 // Read queries from STDIN
608 else {
609 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
610 while (true) {
611 // Read the query from STDIN
612 query_string = in.readLine();
613 if (query_string == null || query_string.length() == -1) {
614 break;
615 }
616
617 runQueryCaching(index_directory, queryer, query_string);
618
619 }
620 }
621 queryer.cleanUp();
622 }
623 catch (IOException exception) {
624 exception.printStackTrace();
625 }
626 }
627
628 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
629 throws IOException
630 {
631 StringBuffer query_results_xml = new StringBuffer();
632
633 // Check if this query result has been cached from a previous search (if it's enabled)
634 File query_result_cache_file = null;
635 if (query_result_caching_enabled) {
636 // Generate the cache file name from the query options
637 String query_result_cache_file_name = query_string + "-";
638 String fuzziness = queryer.getFuzziness();
639 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
640 String filter_string = queryer.getFilterString();
641 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
642 String sort_string = queryer.getSortField();
643 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
644 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
645 query_result_cache_file_name += reverse_sort_string + "-";
646 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
647 query_result_cache_file_name += default_conjunction_operator + "-";
648 int start_results = queryer.getStartResults();
649 int end_results = queryer.getEndResults();
650 query_result_cache_file_name += start_results + "-" + end_results;
651 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
652
653 // If the query result cache file exists, just return its contents and we're done
654 File index_cache_directory = new File(index_directory, "cache");
655 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
656 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
657 FileInputStream fis = new FileInputStream(query_result_cache_file);
658 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
659 BufferedReader buffered_reader = new BufferedReader(isr);
660 String line = "";
661 while ((line = buffered_reader.readLine()) != null) {
662 query_results_xml.append(line + "\n");
663 }
664 String query_results_xml_string = query_results_xml.toString();
665 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
666
667 utf8out.print(query_results_xml_string);
668 utf8out.flush();
669
670 return;
671 }
672 }
673
674 // not cached
675 query_results_xml.append("<ResultSet cached=\"false\">\n");
676 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
677 Filter filter = queryer.getFilter();
678 if (filter != null) {
679 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
680 }
681
682 LuceneQueryResult query_result = queryer.runQuery(query_string);
683 if (query_result == null) {
684 System.err.println("Couldn't run the query");
685 return;
686 }
687
688 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
689 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
690 } else {
691 query_results_xml.append(query_result.getXMLString());
692 }
693 query_results_xml.append("</ResultSet>\n");
694
695 utf8out.print(query_results_xml);
696 utf8out.flush();
697
698 // Cache this query result, if desired
699 if (query_result_caching_enabled) {
700 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
701 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
702 // files, it will just affect the speed of subsequent requests.
703 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
704 // can get very long in some collections)
705 try
706 {
707 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
708 query_result_cache_file_writer.write(query_results_xml.toString());
709 query_result_cache_file_writer.close();
710 }
711 catch (Exception exception)
712 {
713 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
714 }
715 }
716 }
717
718 protected static String fileSafe(String text)
719 {
720 StringBuffer file_safe_text = new StringBuffer();
721 for (int i = 0; i < text.length(); i++) {
722 char character = text.charAt(i);
723 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
724 file_safe_text.append(character);
725 }
726 else {
727 file_safe_text.append('%');
728 file_safe_text.append((int) character);
729 }
730 }
731 return file_safe_text.toString();
732 }
733
734
735}
736
737
Note: See TracBrowser for help on using the repository browser.