source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 16015

Last change on this file since 16015 was 16015, checked in by davidb, 16 years ago

Printing to standard out (used as the communication mechanism back to the Perl script) is now wrapped up in a UTF-8 PrintWriter. Testing showed that it was important to flush the output each time a message is printed.

  • Property svn:keywords set to Author Date Id Revision
File size: 20.4 KB
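
The change described in the commit message corresponds to the static initialiser near the top of the file below. As a rough standalone sketch of the same pattern (the class name here is illustrative only), wrapping System.out in an auto-flushing UTF-8 writer looks like this:

    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    import java.io.UnsupportedEncodingException;

    public class Utf8StdoutSketch {
        public static void main(String[] args) throws UnsupportedEncodingException {
            // autoFlush=true flushes on println/printf/format; plain print() still
            // needs an explicit flush() so the reading process sees it immediately
            PrintWriter utf8out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
            utf8out.print("<ResultSet cached=\"false\"/>");
            utf8out.flush();
        }
    }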
/**********************************************************************
 *
 * GS2LuceneQuery.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.LuceneWrapper;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopFieldDocs;

public class GS2LuceneQuery
{


    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    private String full_indexdir = "";
    private String default_conjunction_operator = "OR";
    private String fuzziness = null;
    private String sort_field = null;
    private Sort sorter = new Sort();
    private String filter_string = null;
    private Filter filter = null;
    private int start_results = 1;
    private int end_results = Integer.MAX_VALUE;

    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    static private PrintWriter utf8out = null;

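    // Wrap System.out in an auto-flushing UTF-8 PrintWriter.  This stream is the
    // communication channel back to the calling Perl script, so every message
    // written to it is flushed as soon as it is printed.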
    static
    {
        try {
            OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
            utf8out = new PrintWriter(osw, true);
        }
        catch (UnsupportedEncodingException e) {
            System.out.println(e);
        }
    }


    public GS2LuceneQuery() {

        // Create one query parser with the standard set of stop words, and one with none

        query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
        query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    }


    public boolean initialise() {

        if (full_indexdir == null || full_indexdir.length() == 0) {
            utf8out.println("Index directory is not indicated ");
            utf8out.flush();
            return false;
        }
        try {
            searcher = new IndexSearcher(full_indexdir);
            reader = ((IndexSearcher) searcher).getIndexReader();

        }
        catch (IOException exception) {
            exception.printStackTrace();
            return false;
        }
        return true;

    }

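    // Run the given query against the index.  The expanded query terms (with
    // their document and term frequencies), any stop words that were removed,
    // and the matching documents are all recorded in the returned LuceneQueryResult.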
    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result = new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = query.rewrite(reader);

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            Iterator iter = terms.iterator();
            while (iter.hasNext()) {

                Term term = (Term) iter.next();

                // Get the term frequency over all the documents
                TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;
                while (term_docs.next()) {
                    if (term_docs.freq() != 0) {
                        term_freq += term_docs.freq();
                        match_docs++;
                    }
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                Hits hits = searcher.search(query, filter, sorter);
                lucene_query_result.setTotalDocs(hits.length());

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.length());

                for (int i = start_results; i <= hits.length(); i++) {
                    Document doc = hits.doc(i - 1);
                    lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.score(i-1));
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results : hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
                    Document doc = reader.document(hits.scoreDocs[i - 1].doc);
                    lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.scoreDocs[i-1].score);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }

    public void setDefaultConjunctionOperator(String default_conjunction_operator) {
        this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
        if (this.default_conjunction_operator.equals("AND")) {
            query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
        } else { // default is OR
            query_parser.setDefaultOperator(QueryParser.OR_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.OR_OPERATOR);
        }
    }

    public String getDefaultConjunctionOperator() {
        return this.default_conjunction_operator;
    }

    public void setEndResults(int end_results) {
        this.end_results = end_results;
    }
    public int getEndResults() {
        return this.end_results;
    }

    public void setFilterString(String filter_string) {
        this.filter_string = filter_string;
        this.filter = parseFilterString(filter_string);
    }
    public String getFilterString() {
        return this.filter_string;
    }

    public Filter getFilter() {
        return this.filter;
    }

    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }

    public void setFuzziness(String fuzziness) {
        this.fuzziness = fuzziness;
    }
    public String getFuzziness() {
        return this.fuzziness;
    }

    public void setSortField(String sort_field) {
        this.sort_field = sort_field;
        if (sort_field == null) {
            this.sorter = new Sort();
        } else {
            this.sorter = new Sort(sort_field);
        }
    }
    public String getSortField() {
        return this.sort_field;
    }

    public void setStartResults(int start_results) {
        if (start_results < 1) {
            start_results = 1;
        }
        this.start_results = start_results;
    }
    public int getStartResults() {
        return this.start_results;
    }

    public void cleanUp() {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Convert the rewritten query back to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term (whitespace or a closing
                    // parenthesis), so append the fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }

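    // Parse a filter string of the form "+FIELD:[NUM TO NUM]" (square brackets for
    // inclusive bounds, curly braces for exclusive bounds) into a Lucene RangeFilter.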
    private Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }


    protected void finalize() throws Throwable
    {
        try {
            utf8out.flush();
        } finally {
            super.finalize();
        }
    }


    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    static private boolean query_result_caching_enabled = false;


    static public void main (String args[])
    {


        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
            return;
        }

        try {
            String index_directory = args[0];

            GS2LuceneQuery queryer = new GS2LuceneQuery();
            queryer.setIndexDir(index_directory);

            // Prepare the index cache directory, if query result caching is enabled
            if (query_result_caching_enabled) {
                // Make the index cache directory if it doesn't already exist
                File index_cache_directory = new File(index_directory, "cache");
                if (!index_cache_directory.exists()) {
                    index_cache_directory.mkdir();
                }

                // Disable caching if the index cache directory isn't available
                if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
                    query_result_caching_enabled = false;
                }
            }

            String query_string = null;

            // Parse the command-line arguments
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    queryer.setSortField(args[i]);
                }
                else if (args[i].equals("-filter")) {
                    i++;
                    queryer.setFilterString(args[i]);
                }
                else if (args[i].equals("-dco")) {
                    i++;
                    queryer.setDefaultConjunctionOperator(args[i]);
                }
                else if (args[i].equals("-fuzziness")) {
                    i++;
                    queryer.setFuzziness(args[i]);
                }
                else if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setStartResults(Integer.parseInt(args[i]));
                    }
                }
                else if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setEndResults(Integer.parseInt(args[i]));
                    }
                }
                else {
                    query_string = args[i];
                }
            }

            if (!queryer.initialise()) {
                return;
            }

            // The query string has been specified as a command-line argument
            if (query_string != null) {
                runQueryCaching(index_directory, queryer, query_string);
            }

            // Read queries from STDIN
            else {
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                while (true) {
                    // Read the query from STDIN
                    query_string = in.readLine();
                    if (query_string == null || query_string.length() == 0) {
                        break;
                    }

                    runQueryCaching(index_directory, queryer, query_string);

                }
            }
            queryer.cleanUp();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

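    // Run a single query.  If caching is enabled and a cached result exists, it is
    // printed straight back; otherwise the query is run, the result set is written
    // to standard out as XML, and (if enabled) cached for subsequent searches.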
    private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
        throws IOException
    {
        StringBuffer query_results_xml = new StringBuffer();

        // Check if this query result has been cached from a previous search (if it's enabled)
        File query_result_cache_file = null;
        if (query_result_caching_enabled) {
            // Generate the cache file name from the query options
            String query_result_cache_file_name = query_string + "-";
            String fuzziness = queryer.getFuzziness();
            query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
            String filter_string = queryer.getFilterString();
            query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
            String sort_string = queryer.getSortField();
            query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
            String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
            query_result_cache_file_name += default_conjunction_operator + "-";
            int start_results = queryer.getStartResults();
            int end_results = queryer.getEndResults();
            query_result_cache_file_name += start_results + "-" + end_results;
            query_result_cache_file_name = fileSafe(query_result_cache_file_name);

            // If the query result cache file exists, just return its contents and we're done
            File index_cache_directory = new File(index_directory, "cache");
            query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
            if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
                FileInputStream fis = new FileInputStream(query_result_cache_file);
                InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
                BufferedReader buffered_reader = new BufferedReader(isr);
                String line = "";
                while ((line = buffered_reader.readLine()) != null) {
                    query_results_xml.append(line + "\n");
                }
                String query_results_xml_string = query_results_xml.toString();
                query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");

                utf8out.print(query_results_xml_string);
                utf8out.flush();

                return;
            }
        }

        // not cached
        query_results_xml.append("<ResultSet cached=\"false\">\n");
        query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
        Filter filter = queryer.getFilter();
        if (filter != null) {
            query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
        }

        LuceneQueryResult query_result = queryer.runQuery(query_string);
        if (query_result == null) {
            System.err.println("Couldn't run the query");
            return;
        }

        if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
            query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
        } else {
            query_results_xml.append(query_result.getXMLString());
        }
        query_results_xml.append("</ResultSet>\n");

        utf8out.print(query_results_xml);
        utf8out.flush();

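        // Debugging aid: also write the query string and result XML to /tmp/lucenequery.txt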
        try {
            /*
            Writer output = null;
            File file = new File("/tmp/lucenequery.txt");
            output = new BufferedWriter(new FileWriter(file,"UTF-8"));
            output.write(query_results_xml.toString());
            output.close();
            */

            FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");

            OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");

            osw2.write("Query string = " + query_string + "\n");
            osw2.write(query_results_xml.toString());
            osw2.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }



        // Cache this query result, if desired
        if (query_result_caching_enabled) {
            FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
            query_result_cache_file_writer.write(query_results_xml.toString());
            query_result_cache_file_writer.close();
        }
    }

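    // Encode a string so that it can safely be used as a cache file name: letters,
    // digits and '-' pass through unchanged; every other character is replaced by
    // '%' followed by its numeric character code.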
    private static String fileSafe(String text)
    {
        StringBuffer file_safe_text = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            char character = text.charAt(i);
            if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
                file_safe_text.append(character);
            }
            else {
                file_safe_text.append('%');
                file_safe_text.append((int) character);
            }
        }
        return file_safe_text.toString();
    }


}
