source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 17804

Last change on this file since 17804 was 17804, checked in by davidb, 15 years ago

Introduction of GS2Analyzer, which overrides default behaviour of StandardAnalyzer to make accent folding of Latin-1 *on*

  • Property svn:keywords set to Author Date Id Revision
File size: 21.2 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
54public class GS2LuceneQuery
55{
56
57
58 static private String TEXTFIELD = "TX";
59
60 // Use the standard set of English stop words by default
61 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
62
63 private String full_indexdir="";
64 private String default_conjunction_operator = "OR";
65 private String fuzziness = null;
66 private String sort_field = null;
67 private Sort sorter=new Sort();
68 private String filter_string = null;
69 private Filter filter = null;
70 private int start_results=1;
71 private int end_results=Integer.MAX_VALUE;
72
73 private QueryParser query_parser = null;
74 private QueryParser query_parser_no_stop_words = null;
75 private Searcher searcher = null;
76 private IndexReader reader = null;
77
78 static private PrintWriter utf8out = null;
79
// Wrap System.out in an auto-flushing writer that encodes everything as UTF-8.
// If the encoding cannot be resolved, fall back to a plain writer over
// System.out so that utf8out is never left null (every later print and
// flush call on it would otherwise throw a NullPointerException).
static
{
    PrintWriter writer;
    try {
        OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
        writer = new PrintWriter(osw, true);
    }
    catch (UnsupportedEncodingException e) {
        System.out.println(e);
        writer = new PrintWriter(System.out, true);
    }
    utf8out = writer;
}
90
91
92 public GS2LuceneQuery() {
93
94 // Create one query parser with the standard set of stop words, and one with none
95
96 query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
97 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
98 }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
126 return null;
127 }
128
129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 // System.err.println("********* query_string " + query_string + "****");
137
138 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
139 query = query.rewrite(reader);
140
141 // Get the list of expanded query terms and their frequencies
142 // num docs matching, and total frequency
143 HashSet terms = new HashSet();
144 query.extractTerms(terms);
145
146 HashMap doc_term_freq_map = new HashMap();
147
148 Iterator iter = terms.iterator();
149 while (iter.hasNext()) {
150
151 Term term = (Term) iter.next();
152
153 // Get the term frequency over all the documents
154 TermDocs term_docs = reader.termDocs(term);
155 int term_freq = 0;
156 int match_docs = 0;
157 while (term_docs.next())
158 {
159 if (term_docs.freq() != 0)
160 {
161 term_freq += term_docs.freq();
162 match_docs++;
163
164 // Calculate the document-level term frequency as well
165 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
166 int doc_term_freq = 0;
167 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
168 {
169 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
170 }
171 doc_term_freq += term_docs.freq();
172
173 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
174 }
175 }
176
177 // Create a term
178 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
179 }
180
181 // Get the list of stop words removed from the query
182 HashSet terms_including_stop_words = new HashSet();
183 query_including_stop_words.extractTerms(terms_including_stop_words);
184 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
185 while (terms_including_stop_words_iter.hasNext()) {
186 Term term = (Term) terms_including_stop_words_iter.next();
187 if (!terms.contains(term)) {
188 lucene_query_result.addStopWord(term.text());
189 }
190 }
191
192 // do the query
193 // Simple case for getting all the matching documents
194 if (end_results == Integer.MAX_VALUE) {
195 // Perform the query (filter and sorter may be null)
196 Hits hits = searcher.search(query, filter, sorter);
197 lucene_query_result.setTotalDocs(hits.length());
198
199 // Output the matching documents
200 lucene_query_result.setStartResults(start_results);
201 lucene_query_result.setEndResults(hits.length());
202
203 for (int i = start_results; i <= hits.length(); i++) {
204 int lucene_doc_num = hits.id(i - 1);
205 Document doc = hits.doc(i - 1);
206 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
207 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
208 }
209 }
210
211 // Slightly more complicated case for returning a subset of the matching documents
212 else {
213 // Perform the query (filter may be null)
214 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
215 lucene_query_result.setTotalDocs(hits.totalHits);
216
217 lucene_query_result.setStartResults(start_results);
218 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
219
220 // Output the matching documents
221 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
222 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
223 Document doc = reader.document(lucene_doc_num);
224 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
225 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
226 }
227 }
228 }
229
230 catch (ParseException parse_exception) {
231 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
232 }
233 catch (TooManyClauses too_many_clauses_exception) {
234 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
235 }
236 catch (IOException exception) {
237 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
238 exception.printStackTrace();
239 }
240 catch (Exception exception) {
241 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
242 exception.printStackTrace();
243 }
244 return lucene_query_result;
245 }
246
247 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
248 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
249 if (default_conjunction_operator.equals("AND")) {
250 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
251 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
252 } else { // default is OR
253 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
254 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
255 }
256 }
257
258 public String getDefaultConjunctionOperator() {
259 return this.default_conjunction_operator;
260 }
261
262 public void setEndResults(int end_results) {
263 this.end_results = end_results;
264 }
265 public int getEndResults() {
266 return this.end_results;
267 }
268
269 public void setFilterString(String filter_string) {
270 this.filter_string = filter_string;
271 this.filter = parseFilterString(filter_string);
272 }
273 public String getFilterString() {
274 return this.filter_string ;
275 }
276
277 public Filter getFilter() {
278 return this.filter;
279 }
280
281 public void setIndexDir(String full_indexdir) {
282 this.full_indexdir = full_indexdir;
283 }
284
285 public void setFuzziness(String fuzziness) {
286 this.fuzziness = fuzziness;
287 }
288 public String getFuzziness() {
289 return this.fuzziness;
290 }
291
292 public void setSortField(String sort_field) {
293 this.sort_field = sort_field;
294 if (sort_field == null) {
295 this.sorter = new Sort();
296 } else {
297 this.sorter = new Sort(sort_field);
298 }
299 }
300 public String getSortField() {
301 return this.sort_field;
302 }
303
304 public void setStartResults(int start_results) {
305 if (start_results < 1) {
306 start_results = 1;
307 }
308 this.start_results = start_results;
309 }
310 public int getStartResults() {
311 return this.start_results;
312 }
313
314 public void cleanUp() {
315 try {
316 if (searcher != null) {
317 searcher.close();
318 }
319 } catch (IOException exception) {
320 exception.printStackTrace();
321 }
322 }
323
324 private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
325 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
326 {
327 // Split query string into the search terms and the filter terms
328 // * The first +(...) term contains the search terms so count
329 // up '(' and stop when we finish matching ')'
330 int offset = 0;
331 int paren_count = 0;
332 boolean seen_paren = false;
333 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
334 if (query_string.charAt(offset) == '(') {
335 paren_count++;
336 seen_paren = true;
337 }
338 if (query_string.charAt(offset) == ')') {
339 paren_count--;
340 }
341 offset++;
342 }
343 String query_prefix = query_string.substring(0, offset);
344 String query_suffix = query_string.substring(offset);
345
346 ///ystem.err.println("Prefix: " + query_prefix);
347 ///ystem.err.println("Suffix: " + query_suffix);
348
349 Query query = query_parser.parse(query_prefix);
350 query = query.rewrite(reader);
351
352 // If this is a fuzzy search, then we need to add the fuzzy
353 // flag to each of the query terms
354 if (fuzziness != null && query.toString().length() > 0) {
355
356 // Revert the query to a string
357 System.err.println("Rewritten query: " + query.toString());
358 // Search through the string for TX:<term> query terms
359 // and append the ~ operator. Note that this search will
360 // not change phrase searches (TX:"<term> <term>") as
361 // fuzzy searching is not possible for these entries.
362 // Yahoo! Time for a state machine!
363 StringBuffer mutable_query_string = new StringBuffer(query.toString());
364 int o = 0; // Offset
365 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
366 int s = 0; // State
367 while(o < mutable_query_string.length()) {
368 char c = mutable_query_string.charAt(o);
369 if (s == 0 && c == TEXTFIELD.charAt(0)) {
370 ///ystem.err.println("Found T!");
371 s = 1;
372 }
373 else if (s == 1) {
374 if (c == TEXTFIELD.charAt(1)) {
375 ///ystem.err.println("Found X!");
376 s = 2;
377 }
378 else {
379 s = 0; // Reset
380 }
381 }
382 else if (s == 2) {
383 if (c == ':') {
384 ///ystem.err.println("Found TX:!");
385 s = 3;
386 }
387 else {
388 s = 0; // Reset
389 }
390 }
391 else if (s == 3) {
392 // Don't process phrases
393 if (c == '"') {
394 ///ystem.err.println("Stupid phrase...");
395 s = 0; // Reset
396 }
397 // Found the end of the term... add the
398 // fuzzy search indicator
399 // Nor outside the scope of parentheses
400 else if (Character.isWhitespace(c) || c == ')') {
401 ///ystem.err.println("Yahoo! Found fuzzy term.");
402 mutable_query_string.insert(o, '~' + fuzziness);
403 o++;
404 s = 0; // Reset
405 }
406 }
407 o++;
408 }
409 // If we were in the state of looking for the end of a
410 // term - then we just found it!
411 if (s == 3) {
412
413 mutable_query_string.append('~' + fuzziness);
414 }
415 // Reparse the query
416 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
417 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
418 }
419 else {
420 query = query_parser.parse(query_prefix + query_suffix);
421 }
422
423 return query;
424 }
425
426 private Filter parseFilterString(String filter_string)
427 {
428 Filter result = null;
429 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
430 Matcher matcher = pattern.matcher(filter_string);
431 if (matcher.matches()) {
432 String field_name = matcher.group(1);
433 boolean include_lower = matcher.group(2).equals("[");
434 String lower_term = matcher.group(3);
435 String upper_term = matcher.group(4);
436 boolean include_upper = matcher.group(5).equals("]");
437 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
438 }
439 else {
440 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
441 }
442 return result;
443 }
444
445
446 protected void finalize() throws Throwable
447 {
448 try {
449 utf8out.flush();
450 } finally {
451 super.finalize();
452 }
453 }
454
455
456 /** command line program and auxiliary methods */
457
458 // Fairly self-explanatory I should hope
459 static private boolean query_result_caching_enabled = false;
460
461
462 static public void main (String args[])
463 {
464
465
466 if (args.length == 0) {
467 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
468 return;
469 }
470
471 try {
472 String index_directory = args[0];
473
474 GS2LuceneQuery queryer = new GS2LuceneQuery();
475 queryer.setIndexDir(index_directory);
476
477 // Prepare the index cache directory, if query result caching is enabled
478 if (query_result_caching_enabled) {
479 // Make the index cache directory if it doesn't already exist
480 File index_cache_directory = new File(index_directory, "cache");
481 if (!index_cache_directory.exists()) {
482 index_cache_directory.mkdir();
483 }
484
485 // Disable caching if the index cache directory isn't available
486 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
487 query_result_caching_enabled = false;
488 }
489 }
490
491 String query_string = null;
492
493 // Parse the command-line arguments
494 for (int i = 1; i < args.length; i++) {
495 if (args[i].equals("-sort")) {
496 i++;
497 queryer.setSortField(args[i]);
498 }
499 else if (args[i].equals("-filter")) {
500 i++;
501 queryer.setFilterString(args[i]);
502 }
503 else if (args[i].equals("-dco")) {
504 i++;
505 queryer.setDefaultConjunctionOperator(args[i]);
506 }
507 else if (args[i].equals("-fuzziness")) {
508 i++;
509 queryer.setFuzziness(args[i]);
510 }
511 else if (args[i].equals("-startresults")) {
512 i++;
513 if (args[i].matches("\\d+")) {
514 queryer.setStartResults(Integer.parseInt(args[i]));
515 }
516 }
517 else if (args[i].equals("-endresults")) {
518 i++;
519 if (args[i].matches("\\d+")) {
520 queryer.setEndResults(Integer.parseInt(args[i]));
521 }
522 }
523 else {
524 query_string = args[i];
525 }
526 }
527
528 if (!queryer.initialise()) {
529 return;
530 }
531
532 // The query string has been specified as a command-line argument
533 if (query_string != null) {
534 runQueryCaching(index_directory, queryer, query_string);
535 }
536
537 // Read queries from STDIN
538 else {
539 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
540 while (true) {
541 // Read the query from STDIN
542 query_string = in.readLine();
543 if (query_string == null || query_string.length() == -1) {
544 break;
545 }
546
547 runQueryCaching(index_directory, queryer, query_string);
548
549 }
550 }
551 queryer.cleanUp();
552 }
553 catch (IOException exception) {
554 exception.printStackTrace();
555 }
556 }
557
558 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
559 throws IOException
560 {
561 StringBuffer query_results_xml = new StringBuffer();
562
563 // Check if this query result has been cached from a previous search (if it's enabled)
564 File query_result_cache_file = null;
565 if (query_result_caching_enabled) {
566 // Generate the cache file name from the query options
567 String query_result_cache_file_name = query_string + "-";
568 String fuzziness = queryer.getFuzziness();
569 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
570 String filter_string = queryer.getFilterString();
571 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
572 String sort_string = queryer.getSortField();
573 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
574 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
575 query_result_cache_file_name += default_conjunction_operator + "-";
576 int start_results = queryer.getStartResults();
577 int end_results = queryer.getEndResults();
578 query_result_cache_file_name += start_results + "-" + end_results;
579 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
580
581 // If the query result cache file exists, just return its contents and we're done
582 File index_cache_directory = new File(index_directory, "cache");
583 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
584 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
585 FileInputStream fis = new FileInputStream(query_result_cache_file);
586 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
587 BufferedReader buffered_reader = new BufferedReader(isr);
588 String line = "";
589 while ((line = buffered_reader.readLine()) != null) {
590 query_results_xml.append(line + "\n");
591 }
592 String query_results_xml_string = query_results_xml.toString();
593 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
594
595 utf8out.print(query_results_xml_string);
596 utf8out.flush();
597
598 return;
599 }
600 }
601
602 // not cached
603 query_results_xml.append("<ResultSet cached=\"false\">\n");
604 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
605 Filter filter = queryer.getFilter();
606 if (filter != null) {
607 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
608 }
609
610 LuceneQueryResult query_result = queryer.runQuery(query_string);
611 if (query_result == null) {
612 System.err.println("Couldn't run the query");
613 return;
614 }
615
616 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
617 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
618 } else {
619 query_results_xml.append(query_result.getXMLString());
620 }
621 query_results_xml.append("</ResultSet>\n");
622
623 utf8out.print(query_results_xml);
624 utf8out.flush();
625
626 try {
627 /*
628 Writer output = null;
629 File file = new File("/tmp/lucenequery.txt");
630 output = new BufferedWriter(new FileWriter(file,"UTF-8"));
631 output.write(query_results_xml.toString());
632 output.close();
633 */
634
635 FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");
636
637 OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");
638
639 osw2.write("Query string = " + query_string + "\n");
640 osw2.write(query_results_xml.toString());
641 osw2.close();
642 }
643 catch (Exception e) {
644 e.printStackTrace();
645 }
646
647
648
649 // Cache this query result, if desired
650 if (query_result_caching_enabled) {
651 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
652 query_result_cache_file_writer.write(query_results_xml.toString());
653 query_result_cache_file_writer.close();
654 }
655 }
656
657 private static String fileSafe(String text)
658 {
659 StringBuffer file_safe_text = new StringBuffer();
660 for (int i = 0; i < text.length(); i++) {
661 char character = text.charAt(i);
662 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
663 file_safe_text.append(character);
664 }
665 else {
666 file_safe_text.append('%');
667 file_safe_text.append((int) character);
668 }
669 }
670 return file_safe_text.toString();
671 }
672
673
674}
675
676
Note: See TracBrowser for help on using the repository browser.