source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 24731

Last change on this file since 24731 was 24731, checked in by sjm84, 13 years ago

The Lucene 3.x version of the code was accidentally committed; rolling back to the 2.x-compatible version.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
54public class GS2LuceneQuery
55{
56
57
58 static private String TEXTFIELD = "TX";
59
60 // Use the standard set of English stop words by default
61 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
62
63 private String full_indexdir="";
64 private String default_conjunction_operator = "OR";
65 private String fuzziness = null;
66 private String sort_field = null;
67 private Sort sorter=new Sort();
68 private String filter_string = null;
69 private Filter filter = null;
70 private int start_results=1;
71 private int end_results=Integer.MAX_VALUE;
72
73 private QueryParser query_parser = null;
74 private QueryParser query_parser_no_stop_words = null;
75 private Searcher searcher = null;
76 private IndexReader reader = null;
77
78 static private PrintWriter utf8out = null;
79
80 static
81 {
82 try {
83 OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
84 utf8out = new PrintWriter(osw, true);
85 }
86 catch (UnsupportedEncodingException e) {
87 System.out.println(e);
88 }
89 }
90
91
92 public GS2LuceneQuery() {
93
94 // Create one query parser with the standard set of stop words, and one with none
95
96 query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
97 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
98 }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
126 return null;
127 }
128
129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 // System.err.println("********* query_string " + query_string + "****");
137
138 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
139 query = query.rewrite(reader);
140
141 // Get the list of expanded query terms and their frequencies
142 // num docs matching, and total frequency
143 HashSet terms = new HashSet();
144 query.extractTerms(terms);
145
146 HashMap doc_term_freq_map = new HashMap();
147
148 Iterator iter = terms.iterator();
149 while (iter.hasNext()) {
150
151 Term term = (Term) iter.next();
152
153 // Get the term frequency over all the documents
154 TermDocs term_docs = reader.termDocs(term);
155 int term_freq = 0;
156 int match_docs = 0;
157 while (term_docs.next())
158 {
159 if (term_docs.freq() != 0)
160 {
161 term_freq += term_docs.freq();
162 match_docs++;
163
164 // Calculate the document-level term frequency as well
165 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
166 int doc_term_freq = 0;
167 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
168 {
169 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
170 }
171 doc_term_freq += term_docs.freq();
172
173 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
174 }
175 }
176
177 // Create a term
178 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
179 }
180
181 // Get the list of stop words removed from the query
182 HashSet terms_including_stop_words = new HashSet();
183 query_including_stop_words.extractTerms(terms_including_stop_words);
184 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
185 while (terms_including_stop_words_iter.hasNext()) {
186 Term term = (Term) terms_including_stop_words_iter.next();
187 if (!terms.contains(term)) {
188 lucene_query_result.addStopWord(term.text());
189 }
190 }
191
192 // do the query
193 // Simple case for getting all the matching documents
194 if (end_results == Integer.MAX_VALUE) {
195 // Perform the query (filter and sorter may be null)
196 Hits hits = searcher.search(query, filter, sorter);
197 lucene_query_result.setTotalDocs(hits.length());
198
199 // Output the matching documents
200 lucene_query_result.setStartResults(start_results);
201 lucene_query_result.setEndResults(hits.length());
202
203 for (int i = start_results; i <= hits.length(); i++) {
204 int lucene_doc_num = hits.id(i - 1);
205 Document doc = hits.doc(i - 1);
206 int doc_term_freq = 0;
207 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
208 if (doc_term_freq_object != null)
209 {
210 doc_term_freq = doc_term_freq_object.intValue();
211 }
212 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
213 }
214 }
215
216 // Slightly more complicated case for returning a subset of the matching documents
217 else {
218 // Perform the query (filter may be null)
219 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
220 lucene_query_result.setTotalDocs(hits.totalHits);
221
222 lucene_query_result.setStartResults(start_results);
223 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
224
225 // Output the matching documents
226 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
227 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
228 Document doc = reader.document(lucene_doc_num);
229 int doc_term_freq = 0;
230 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
231 if (doc_term_freq_object != null)
232 {
233 doc_term_freq = doc_term_freq_object.intValue();
234 }
235 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
236 }
237 }
238 }
239
240 catch (ParseException parse_exception) {
241 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
242 }
243 catch (TooManyClauses too_many_clauses_exception) {
244 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
245 }
246 catch (IOException exception) {
247 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
248 exception.printStackTrace();
249 }
250 catch (Exception exception) {
251 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
252 exception.printStackTrace();
253 }
254 return lucene_query_result;
255 }
256
257 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
258 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
259 if (default_conjunction_operator.equals("AND")) {
260 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
261 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
262 } else { // default is OR
263 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
264 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
265 }
266 }
267
268 public String getDefaultConjunctionOperator() {
269 return this.default_conjunction_operator;
270 }
271
272 public void setEndResults(int end_results) {
273 this.end_results = end_results;
274 }
275 public int getEndResults() {
276 return this.end_results;
277 }
278
279 public void setFilterString(String filter_string) {
280 this.filter_string = filter_string;
281 this.filter = parseFilterString(filter_string);
282 }
283 public String getFilterString() {
284 return this.filter_string ;
285 }
286
287 public Filter getFilter() {
288 return this.filter;
289 }
290
291 public void setIndexDir(String full_indexdir) {
292 this.full_indexdir = full_indexdir;
293 }
294
295 public void setFuzziness(String fuzziness) {
296 this.fuzziness = fuzziness;
297 }
298 public String getFuzziness() {
299 return this.fuzziness;
300 }
301
302 public void setSortField(String sort_field) {
303 this.sort_field = sort_field;
304 if (sort_field == null) {
305 this.sorter = new Sort();
306 } else {
307 this.sorter = new Sort(sort_field);
308 }
309 }
310 public String getSortField() {
311 return this.sort_field;
312 }
313
314 public void setStartResults(int start_results) {
315 if (start_results < 1) {
316 start_results = 1;
317 }
318 this.start_results = start_results;
319 }
320 public int getStartResults() {
321 return this.start_results;
322 }
323
324 public void cleanUp() {
325 try {
326 if (searcher != null) {
327 searcher.close();
328 }
329 } catch (IOException exception) {
330 exception.printStackTrace();
331 }
332 }
333
334 private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
335 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
336 {
337 // Split query string into the search terms and the filter terms
338 // * The first +(...) term contains the search terms so count
339 // up '(' and stop when we finish matching ')'
340 int offset = 0;
341 int paren_count = 0;
342 boolean seen_paren = false;
343 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
344 if (query_string.charAt(offset) == '(') {
345 paren_count++;
346 seen_paren = true;
347 }
348 if (query_string.charAt(offset) == ')') {
349 paren_count--;
350 }
351 offset++;
352 }
353 String query_prefix = query_string.substring(0, offset);
354 String query_suffix = query_string.substring(offset);
355
356 ///ystem.err.println("Prefix: " + query_prefix);
357 ///ystem.err.println("Suffix: " + query_suffix);
358
359 Query query = query_parser.parse(query_prefix);
360 query = query.rewrite(reader);
361
362 // If this is a fuzzy search, then we need to add the fuzzy
363 // flag to each of the query terms
364 if (fuzziness != null && query.toString().length() > 0) {
365
366 // Revert the query to a string
367 System.err.println("Rewritten query: " + query.toString());
368 // Search through the string for TX:<term> query terms
369 // and append the ~ operator. Note that this search will
370 // not change phrase searches (TX:"<term> <term>") as
371 // fuzzy searching is not possible for these entries.
372 // Yahoo! Time for a state machine!
373 StringBuffer mutable_query_string = new StringBuffer(query.toString());
374 int o = 0; // Offset
375 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
376 int s = 0; // State
377 while(o < mutable_query_string.length()) {
378 char c = mutable_query_string.charAt(o);
379 if (s == 0 && c == TEXTFIELD.charAt(0)) {
380 ///ystem.err.println("Found T!");
381 s = 1;
382 }
383 else if (s == 1) {
384 if (c == TEXTFIELD.charAt(1)) {
385 ///ystem.err.println("Found X!");
386 s = 2;
387 }
388 else {
389 s = 0; // Reset
390 }
391 }
392 else if (s == 2) {
393 if (c == ':') {
394 ///ystem.err.println("Found TX:!");
395 s = 3;
396 }
397 else {
398 s = 0; // Reset
399 }
400 }
401 else if (s == 3) {
402 // Don't process phrases
403 if (c == '"') {
404 ///ystem.err.println("Stupid phrase...");
405 s = 0; // Reset
406 }
407 // Found the end of the term... add the
408 // fuzzy search indicator
409 // Nor outside the scope of parentheses
410 else if (Character.isWhitespace(c) || c == ')') {
411 ///ystem.err.println("Yahoo! Found fuzzy term.");
412 mutable_query_string.insert(o, '~' + fuzziness);
413 o++;
414 s = 0; // Reset
415 }
416 }
417 o++;
418 }
419 // If we were in the state of looking for the end of a
420 // term - then we just found it!
421 if (s == 3) {
422
423 mutable_query_string.append('~' + fuzziness);
424 }
425 // Reparse the query
426 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
427 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
428 }
429 else {
430 query = query_parser.parse(query_prefix + query_suffix);
431 }
432
433 return query;
434 }
435
436 private Filter parseFilterString(String filter_string)
437 {
438 Filter result = null;
439 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
440 Matcher matcher = pattern.matcher(filter_string);
441 if (matcher.matches()) {
442 String field_name = matcher.group(1);
443 boolean include_lower = matcher.group(2).equals("[");
444 String lower_term = matcher.group(3);
445 String upper_term = matcher.group(4);
446 boolean include_upper = matcher.group(5).equals("]");
447 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
448 }
449 else {
450 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
451 }
452 return result;
453 }
454
455
456 protected void finalize() throws Throwable
457 {
458 try {
459 utf8out.flush();
460 } finally {
461 super.finalize();
462 }
463 }
464
465
466 /** command line program and auxiliary methods */
467
468 // Fairly self-explanatory I should hope
469 static private boolean query_result_caching_enabled = false;
470
471
472 static public void main (String args[])
473 {
474
475
476 if (args.length == 0) {
477 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
478 return;
479 }
480
481 try {
482 String index_directory = args[0];
483
484 GS2LuceneQuery queryer = new GS2LuceneQuery();
485 queryer.setIndexDir(index_directory);
486
487 // Prepare the index cache directory, if query result caching is enabled
488 if (query_result_caching_enabled) {
489 // Make the index cache directory if it doesn't already exist
490 File index_cache_directory = new File(index_directory, "cache");
491 if (!index_cache_directory.exists()) {
492 index_cache_directory.mkdir();
493 }
494
495 // Disable caching if the index cache directory isn't available
496 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
497 query_result_caching_enabled = false;
498 }
499 }
500
501 String query_string = null;
502
503 // Parse the command-line arguments
504 for (int i = 1; i < args.length; i++) {
505 if (args[i].equals("-sort")) {
506 i++;
507 queryer.setSortField(args[i]);
508 }
509 else if (args[i].equals("-filter")) {
510 i++;
511 queryer.setFilterString(args[i]);
512 }
513 else if (args[i].equals("-dco")) {
514 i++;
515 queryer.setDefaultConjunctionOperator(args[i]);
516 }
517 else if (args[i].equals("-fuzziness")) {
518 i++;
519 queryer.setFuzziness(args[i]);
520 }
521 else if (args[i].equals("-startresults")) {
522 i++;
523 if (args[i].matches("\\d+")) {
524 queryer.setStartResults(Integer.parseInt(args[i]));
525 }
526 }
527 else if (args[i].equals("-endresults")) {
528 i++;
529 if (args[i].matches("\\d+")) {
530 queryer.setEndResults(Integer.parseInt(args[i]));
531 }
532 }
533 else {
534 query_string = args[i];
535 }
536 }
537
538 if (!queryer.initialise()) {
539 return;
540 }
541
542 // The query string has been specified as a command-line argument
543 if (query_string != null) {
544 runQueryCaching(index_directory, queryer, query_string);
545 }
546
547 // Read queries from STDIN
548 else {
549 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
550 while (true) {
551 // Read the query from STDIN
552 query_string = in.readLine();
553 if (query_string == null || query_string.length() == -1) {
554 break;
555 }
556
557 runQueryCaching(index_directory, queryer, query_string);
558
559 }
560 }
561 queryer.cleanUp();
562 }
563 catch (IOException exception) {
564 exception.printStackTrace();
565 }
566 }
567
568 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
569 throws IOException
570 {
571 StringBuffer query_results_xml = new StringBuffer();
572
573 // Check if this query result has been cached from a previous search (if it's enabled)
574 File query_result_cache_file = null;
575 if (query_result_caching_enabled) {
576 // Generate the cache file name from the query options
577 String query_result_cache_file_name = query_string + "-";
578 String fuzziness = queryer.getFuzziness();
579 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
580 String filter_string = queryer.getFilterString();
581 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
582 String sort_string = queryer.getSortField();
583 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
584 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
585 query_result_cache_file_name += default_conjunction_operator + "-";
586 int start_results = queryer.getStartResults();
587 int end_results = queryer.getEndResults();
588 query_result_cache_file_name += start_results + "-" + end_results;
589 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
590
591 // If the query result cache file exists, just return its contents and we're done
592 File index_cache_directory = new File(index_directory, "cache");
593 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
594 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
595 FileInputStream fis = new FileInputStream(query_result_cache_file);
596 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
597 BufferedReader buffered_reader = new BufferedReader(isr);
598 String line = "";
599 while ((line = buffered_reader.readLine()) != null) {
600 query_results_xml.append(line + "\n");
601 }
602 String query_results_xml_string = query_results_xml.toString();
603 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
604
605 utf8out.print(query_results_xml_string);
606 utf8out.flush();
607
608 return;
609 }
610 }
611
612 // not cached
613 query_results_xml.append("<ResultSet cached=\"false\">\n");
614 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
615 Filter filter = queryer.getFilter();
616 if (filter != null) {
617 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
618 }
619
620 LuceneQueryResult query_result = queryer.runQuery(query_string);
621 if (query_result == null) {
622 System.err.println("Couldn't run the query");
623 return;
624 }
625
626 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
627 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
628 } else {
629 query_results_xml.append(query_result.getXMLString());
630 }
631 query_results_xml.append("</ResultSet>\n");
632
633 utf8out.print(query_results_xml);
634 utf8out.flush();
635
636 // Cache this query result, if desired
637 if (query_result_caching_enabled) {
638 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
639 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
640 // files, it will just affect the speed of subsequent requests.
641 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
642 // can get very long in some collections)
643 try
644 {
645 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
646 query_result_cache_file_writer.write(query_results_xml.toString());
647 query_result_cache_file_writer.close();
648 }
649 catch (Exception exception)
650 {
651 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
652 }
653 }
654 }
655
656 private static String fileSafe(String text)
657 {
658 StringBuffer file_safe_text = new StringBuffer();
659 for (int i = 0; i < text.length(); i++) {
660 char character = text.charAt(i);
661 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
662 file_safe_text.append(character);
663 }
664 else {
665 file_safe_text.append('%');
666 file_safe_text.append((int) character);
667 }
668 }
669 return file_safe_text.toString();
670 }
671
672
673}
674
675
Note: See TracBrowser for help on using the repository browser.