source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java@ 26155

Last change on this file since 26155 was 26155, checked in by ak19, 12 years ago

Searching with wildcards on lucene collection now displays term info in search results. But only at section level, not yet at document level. The latter can be accomplished by configuring the multitermquery rewrite method to a setting that could throw an exception if the number of terms exceeds BooleanQuery.getMaxClauseCount(). But then searching with wildcards will work like GS2 again (where it works now, since GS2's LuceneWrapper uses lucene core library 2.3.2 and GS3's LuceneWrapper3 uses lucene core library 3.3.0).

File size: 22.2 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper3;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.IndexSearcher;
44import org.apache.lucene.search.Query;
45import org.apache.lucene.search.TermRangeFilter;
46import org.apache.lucene.search.Searcher;
47import org.apache.lucene.search.ScoreDoc;
48import org.apache.lucene.search.Sort;
49import org.apache.lucene.search.SortField;
50import org.apache.lucene.search.TopFieldDocs;
51
52import org.apache.lucene.store.Directory;
53import org.apache.lucene.store.FSDirectory;
54import org.apache.lucene.util.Version;
55
56import org.apache.lucene.search.MultiTermQuery;
57import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
58
public class GS2LuceneQuery extends SharedSoleneQuery
{
    // Full path of the Lucene index directory to be searched.
    protected String full_indexdir="";

    // Result ordering (defaults to relevance) and optional term-range filter;
    // both may legitimately be null/empty when passed to the searcher.
    protected Sort sorter=new Sort();
    protected Filter filter = null;

    // Lucene compatibility version handed to the query parsers.
    protected static Version matchVersion = Version.LUCENE_24;

    // Two parsers over the same text field: one using the analyzer's built-in
    // stop-word set, and one with no stop words at all. Comparing their parsed
    // terms is how runQuery() reports which words were dropped as stop words.
    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    protected Searcher searcher = null;
    protected IndexReader reader = null;

    /** Creates the two query parsers; the index itself is opened later by initialise(). */
    public GS2LuceneQuery() {
        super();

        // Create one query parser with the standard set of stop words, and one with none

        query_parser = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
        query_parser_no_stop_words = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer(new String[] { }));
    }
81
82
83 public boolean initialise() {
84
85 if (!super.initialise()) {
86 return false;
87 }
88
89
90 if (full_indexdir==null || full_indexdir.length()==-1){
91 utf8out.println("Index directory is not indicated ");
92 utf8out.flush();
93 return false;
94 }
95
96 try {
97 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
98 searcher = new IndexSearcher(full_indexdir_dir,true);
99 reader = ((IndexSearcher) searcher).getIndexReader();
100
101 }
102 catch (IOException exception) {
103 exception.printStackTrace();
104 return false;
105 }
106 return true;
107
108 }
109
    /** Records the full path of the Lucene index directory that initialise() will open. */
    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }
113
114 public void setSortField(String sort_field) {
115 super.setSortField(sort_field);
116
117 if (sort_field == null) {
118 this.sorter = new Sort();
119 } else {
120 this.sorter = new Sort(new SortField(sort_field,SortField.STRING)); // **** can do better than this?!?
121 }
122 }
123
    /**
     * Stores the raw filter string and immediately parses it into the Lucene
     * Filter applied to every subsequent search (null if it could not be parsed).
     */
    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }
128
    /** @return the filter built by setFilterString(), or null if none was set or it failed to parse */
    public Filter getFilter() {
        return this.filter;
    }
132
133
134 public LuceneQueryResult runQuery(String query_string) {
135
136 if (query_string == null || query_string.equals("")) {
137 utf8out.println("The query word is not indicated ");
138 utf8out.flush();
139 return null;
140 }
141
142 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
143 lucene_query_result.clear();
144
145 try {
146 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
147 query_including_stop_words = query_including_stop_words.rewrite(reader);
148
149 // System.err.println("********* query_string " + query_string + "****");
150
151 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
152
153 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
154 // This change in lucene core library for GS3 had the side-effect that searching on
155 // "econom*" didn't display what terms it was searching for, whereas it had done so in GS2.
156
157 // The details of this problem and its current solution are explained in the ticket
158 // http://trac.greenstone.org/ticket/845
159
160 // We need to change the settings for rewriteMethod in order to get searches on wildcards to
161 // produce search terms again when the query is rewritten.
162
163 if(query instanceof MultiTermQuery) {
164
165 // default docCountPercent=0.1; default termCountCutoff=350
166
167 // Creating custom cutoff values, taking into account of existing cutoff values
168 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
169 customRewriteMethod.setDocCountPercent(100.0);//MultiTermQuery.ConstantScoreAutoRewrite.DEFAULT_DOC_COUNT_PERCENT);
170 customRewriteMethod.setTermCountCutoff(350);
171
172 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
173 multiTermQuery.setRewriteMethod(customRewriteMethod);
174
175 // the above works when searching with wildcards over sections, the following also
176 // works on book searches, but has been discouraged as it can throw an exception if
177 // the number of terms exceeds BooleanQuery.getMaxClauseCount().
178 // http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
179
180 //multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);//MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
181 }
182
183 query = query.rewrite(reader);
184
185 // Get the list of expanded query terms and their frequencies
186 // num docs matching, and total frequency
187 HashSet terms = new HashSet();
188 query.extractTerms(terms);
189
190 HashMap doc_term_freq_map = new HashMap();
191
192 Iterator iter = terms.iterator();
193 while (iter.hasNext()) {
194
195 Term term = (Term) iter.next();
196
197 // Get the term frequency over all the documents
198 TermDocs term_docs = reader.termDocs(term);
199 int term_freq = 0;
200 int match_docs = 0;
201 while (term_docs.next())
202 {
203 if (term_docs.freq() != 0)
204 {
205 term_freq += term_docs.freq();
206 match_docs++;
207
208 // Calculate the document-level term frequency as well
209 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
210 int doc_term_freq = 0;
211 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
212 {
213 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
214 }
215 doc_term_freq += term_docs.freq();
216
217 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
218 }
219 }
220
221 // Create a term
222 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
223 }
224
225 // Get the list of stop words removed from the query
226 HashSet terms_including_stop_words = new HashSet();
227 query_including_stop_words.extractTerms(terms_including_stop_words);
228 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
229 while (terms_including_stop_words_iter.hasNext()) {
230 Term term = (Term) terms_including_stop_words_iter.next();
231 if (!terms.contains(term)) {
232 lucene_query_result.addStopWord(term.text());
233 }
234 }
235
236 // do the query
237 // Simple case for getting all the matching documents
238 if (end_results == Integer.MAX_VALUE) {
239 // Perform the query (filter and sorter may be null)
240 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
241 lucene_query_result.setTotalDocs(hits.totalHits);
242
243 // Output the matching documents
244 lucene_query_result.setStartResults(start_results);
245 lucene_query_result.setEndResults(hits.totalHits);
246
247 for (int i = start_results; i <= hits.totalHits; i++) {
248 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
249 Document doc = reader.document(lucene_doc_num);
250 int doc_term_freq = 0;
251 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
252 if (doc_term_freq_object != null)
253 {
254 doc_term_freq = doc_term_freq_object.intValue();
255 }
256 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
257 }
258 }
259
260 // Slightly more complicated case for returning a subset of the matching documents
261 else {
262 // Perform the query (filter may be null)
263 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
264 lucene_query_result.setTotalDocs(hits.totalHits);
265
266 lucene_query_result.setStartResults(start_results);
267 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
268
269 // Output the matching documents
270 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
271 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
272 Document doc = reader.document(lucene_doc_num);
273 int doc_term_freq = 0;
274 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
275 if (doc_term_freq_object != null)
276 {
277 doc_term_freq = doc_term_freq_object.intValue();
278 }
279 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
280 }
281 }
282 }
283
284 catch (ParseException parse_exception) {
285 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
286 }
287 catch (TooManyClauses too_many_clauses_exception) {
288 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
289 }
290 catch (IOException exception) {
291 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
292 exception.printStackTrace();
293 }
294 catch (Exception exception) {
295 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
296 exception.printStackTrace();
297 }
298 return lucene_query_result;
299 }
300
301 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
302 super.setDefaultConjunctionOperator(default_conjunction_operator);
303
304 if (default_conjunction_operator.equals("AND")) {
305 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
306 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
307 } else { // default is OR
308 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
309 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
310 }
311 }
312
313
314 public void cleanUp() {
315 super.cleanUp();
316 try {
317 if (searcher != null) {
318 searcher.close();
319 }
320 } catch (IOException exception) {
321 exception.printStackTrace();
322 }
323 }
324
325
    /**
     * Parses a raw GS2 query string into a Lucene Query, optionally marking
     * every plain TX:&lt;term&gt; with a fuzziness factor (e.g. "~0.7").
     *
     * The query string is split into a "search terms" prefix — the first
     * parenthesised group — and a "filter terms" suffix. Only the prefix is
     * rewritten for fuzziness; the suffix is re-appended verbatim before the
     * final parse.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser (with or without stop words) to use
     * @param query_string the raw query text
     * @param fuzziness    fuzzy-match factor as a string, or null for exact matching
     * @throws java.io.IOException on index access failure during rewrite
     * @throws org.apache.lucene.queryParser.ParseException on malformed query text
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Implemented as a small character-by-character state machine.
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset into mutable_query_string
            // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            // (the letters are really TEXTFIELD.charAt(0) and charAt(1))
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    // Matched the first character of the text-field name
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        // Matched the second character of the text-field name
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        // Matched the full "TX:" field prefix; now inside a term
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        s = 0; // Reset
                    }
                    // Found the end of the term (whitespace, or a closing
                    // paren so we never mark outside the parenthesised group)
                    // ... add the fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the fuzzified prefix together with the untouched suffix
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }
427
428 protected Filter parseFilterString(String filter_string)
429 {
430 Filter result = null;
431 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
432 Matcher matcher = pattern.matcher(filter_string);
433 if (matcher.matches()) {
434 String field_name = matcher.group(1);
435 boolean include_lower = matcher.group(2).equals("[");
436 String lower_term = matcher.group(3);
437 String upper_term = matcher.group(4);
438 boolean include_upper = matcher.group(5).equals("]");
439 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
440 }
441 else {
442 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
443 }
444 return result;
445 }
446
447
    /** command line program and auxiliary methods */

    // When true, query results are written to (and served from) a "cache"
    // subdirectory of the index directory instead of always re-running the
    // search. main() disables this again if the cache directory is unusable.
    static protected boolean query_result_caching_enabled = false;
452
453
454 static public void main (String args[])
455 {
456 if (args.length == 0) {
457 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
458 return;
459 }
460
461 try {
462 String index_directory = args[0];
463
464 GS2LuceneQuery queryer = new GS2LuceneQuery();
465 queryer.setIndexDir(index_directory);
466
467 // Prepare the index cache directory, if query result caching is enabled
468 if (query_result_caching_enabled) {
469 // Make the index cache directory if it doesn't already exist
470 File index_cache_directory = new File(index_directory, "cache");
471 if (!index_cache_directory.exists()) {
472 index_cache_directory.mkdir();
473 }
474
475 // Disable caching if the index cache directory isn't available
476 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
477 query_result_caching_enabled = false;
478 }
479 }
480
481 String query_string = null;
482
483 // Parse the command-line arguments
484 for (int i = 1; i < args.length; i++) {
485 if (args[i].equals("-sort")) {
486 i++;
487 queryer.setSortField(args[i]);
488 }
489 else if (args[i].equals("-filter")) {
490 i++;
491 queryer.setFilterString(args[i]);
492 }
493 else if (args[i].equals("-dco")) {
494 i++;
495 queryer.setDefaultConjunctionOperator(args[i]);
496 }
497 else if (args[i].equals("-fuzziness")) {
498 i++;
499 queryer.setFuzziness(args[i]);
500 }
501 else if (args[i].equals("-startresults")) {
502 i++;
503 if (args[i].matches("\\d+")) {
504 queryer.setStartResults(Integer.parseInt(args[i]));
505 }
506 }
507 else if (args[i].equals("-endresults")) {
508 i++;
509 if (args[i].matches("\\d+")) {
510 queryer.setEndResults(Integer.parseInt(args[i]));
511 }
512 }
513 else {
514 query_string = args[i];
515 }
516 }
517
518 if (!queryer.initialise()) {
519 return;
520 }
521
522 // The query string has been specified as a command-line argument
523 if (query_string != null) {
524 runQueryCaching(index_directory, queryer, query_string);
525 }
526
527 // Read queries from STDIN
528 else {
529 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
530 while (true) {
531 // Read the query from STDIN
532 query_string = in.readLine();
533 if (query_string == null || query_string.length() == -1) {
534 break;
535 }
536
537 runQueryCaching(index_directory, queryer, query_string);
538
539 }
540 }
541 queryer.cleanUp();
542 }
543 catch (IOException exception) {
544 exception.printStackTrace();
545 }
546 }
547
548 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
549 throws IOException
550 {
551 StringBuffer query_results_xml = new StringBuffer();
552
553 // Check if this query result has been cached from a previous search (if it's enabled)
554 File query_result_cache_file = null;
555 if (query_result_caching_enabled) {
556 // Generate the cache file name from the query options
557 String query_result_cache_file_name = query_string + "-";
558 String fuzziness = queryer.getFuzziness();
559 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
560 String filter_string = queryer.getFilterString();
561 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
562 String sort_string = queryer.getSortField();
563 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
564 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
565 query_result_cache_file_name += default_conjunction_operator + "-";
566 int start_results = queryer.getStartResults();
567 int end_results = queryer.getEndResults();
568 query_result_cache_file_name += start_results + "-" + end_results;
569 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
570
571 // If the query result cache file exists, just return its contents and we're done
572 File index_cache_directory = new File(index_directory, "cache");
573 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
574 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
575 FileInputStream fis = new FileInputStream(query_result_cache_file);
576 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
577 BufferedReader buffered_reader = new BufferedReader(isr);
578 String line = "";
579 while ((line = buffered_reader.readLine()) != null) {
580 query_results_xml.append(line + "\n");
581 }
582 String query_results_xml_string = query_results_xml.toString();
583 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
584
585 utf8out.print(query_results_xml_string);
586 utf8out.flush();
587
588 return;
589 }
590 }
591
592 // not cached
593 query_results_xml.append("<ResultSet cached=\"false\">\n");
594 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
595 Filter filter = queryer.getFilter();
596 if (filter != null) {
597 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
598 }
599
600 LuceneQueryResult query_result = queryer.runQuery(query_string);
601 if (query_result == null) {
602 System.err.println("Couldn't run the query");
603 return;
604 }
605
606 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
607 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
608 } else {
609 query_results_xml.append(query_result.getXMLString());
610 }
611 query_results_xml.append("</ResultSet>\n");
612
613 utf8out.print(query_results_xml);
614 utf8out.flush();
615
616 // Cache this query result, if desired
617 if (query_result_caching_enabled) {
618 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
619 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
620 // files, it will just affect the speed of subsequent requests.
621 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
622 // can get very long in some collections)
623 try
624 {
625 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
626 query_result_cache_file_writer.write(query_results_xml.toString());
627 query_result_cache_file_writer.close();
628 }
629 catch (Exception exception)
630 {
631 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
632 }
633 }
634 }
635
636 protected static String fileSafe(String text)
637 {
638 StringBuffer file_safe_text = new StringBuffer();
639 for (int i = 0; i < text.length(); i++) {
640 char character = text.charAt(i);
641 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
642 file_safe_text.append(character);
643 }
644 else {
645 file_safe_text.append('%');
646 file_safe_text.append((int) character);
647 }
648 }
649 return file_safe_text.toString();
650 }
651
652
653}
654
655
Note: See TracBrowser for help on using the repository browser.