source: gsdl/trunk/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 16583

Last change on this file since 16583 was 16583, checked in by davidb, 16 years ago

Undoing change committed in r16582

  • Property svn:keywords set to Author Date Id Revision
File size: 20.4 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
54public class GS2LuceneQuery
55{
56
57
    // Name of the Lucene field holding the full text of each document
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    // Path of the Lucene index directory; must be set before initialise()
    private String full_indexdir="";
    // Default boolean operator between query terms: "OR" or "AND"
    private String default_conjunction_operator = "OR";
    // Fuzzy-match factor appended as "~<fuzziness>" to query terms; null disables
    private String fuzziness = null;
    // Field to sort results on; null means relevance ordering
    private String sort_field = null;
    private Sort sorter=new Sort();
    // Raw filter expression as passed to setFilterString(), kept for reporting
    private String filter_string = null;
    // Parsed form of filter_string; null when no filter (or the parse failed)
    private Filter filter = null;
    // 1-based result window returned by runQuery()
    private int start_results=1;
    private int end_results=Integer.MAX_VALUE;

    // Two parsers: one strips standard English stop words, one keeps them
    // (the difference is used to report which stop words were removed)
    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    // Shared UTF-8 writer wrapping System.out for all user-visible output
    static private PrintWriter utf8out = null;

    static
    {
	try {
	    OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
	    utf8out = new PrintWriter(osw, true);
	}
	catch (UnsupportedEncodingException e) {
	    System.out.println(e);
	}
    }
91
    /**
     * Builds the two query parsers over the text field: one using the
     * standard English stop-word list and one with no stop words at all,
     * so runQuery() can report which stop words were stripped.
     */
    public GS2LuceneQuery() {

	// Create one query parser with the standard set of stop words, and one with none

	query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
	query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    }
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
    /**
     * Runs the given query string against the opened index and collects the
     * matching documents, the expanded query terms (with frequencies) and the
     * stop words removed from the query into a LuceneQueryResult.
     *
     * Requires a prior successful call to initialise() (uses the searcher and
     * reader fields). Returns null when query_string is null or empty;
     * otherwise always returns a LuceneQueryResult, with an error code set
     * when parsing or searching failed.
     */
    public LuceneQueryResult runQuery(String query_string) {

	if (query_string == null || query_string.equals("")) {
	    utf8out.println("The query word is not indicated ");
	    utf8out.flush();
	    return null;
	}

	LuceneQueryResult lucene_query_result=new LuceneQueryResult();
	lucene_query_result.clear();

	try {
	    // Parse the query twice: once with the stop-word-preserving parser
	    // and once normally; the difference identifies removed stop words.
	    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
	    query_including_stop_words = query_including_stop_words.rewrite(reader);

	    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
	    query = query.rewrite(reader);

	    // Get the list of expanded query terms and their frequencies
	    // num docs matching, and total frequency
	    HashSet terms = new HashSet();
	    query.extractTerms(terms);

	    Iterator iter = terms.iterator();
	    while (iter.hasNext()) {

		Term term = (Term) iter.next();

		// Get the term frequency over all the documents
		TermDocs term_docs = reader.termDocs(term);
		// NOTE(review): freq() is read here before the first call to
		// next(); confirm this matches the TermDocs contract for this
		// Lucene version — normally next() must succeed before freq()
		// returns a meaningful value.
		int term_freq = term_docs.freq();
		int match_docs = 0;
		if (term_freq != 0) match_docs++;
		while (term_docs.next()) {
		    term_freq += term_docs.freq();
		    if (term_docs.freq()!= 0) {
			match_docs++;
		    }
		}

		// Create a term
		lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
	    }

	    // Get the list of stop words removed from the query
	    // (terms present only in the stop-word-preserving parse)
	    HashSet terms_including_stop_words = new HashSet();
	    query_including_stop_words.extractTerms(terms_including_stop_words);
	    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
	    while (terms_including_stop_words_iter.hasNext()) {
		Term term = (Term) terms_including_stop_words_iter.next();
		if (!terms.contains(term)) {
		    lucene_query_result.addStopWord(term.text());
		}
	    }

	    // do the query
	    // Simple case for getting all the matching documents
	    if (end_results == Integer.MAX_VALUE) {
		// Perform the query (filter and sorter may be null)
		Hits hits = searcher.search(query, filter, sorter);
		lucene_query_result.setTotalDocs(hits.length());

		// Output the matching documents
		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(hits.length());

		// start_results is 1-based, hence the (i - 1) when indexing hits
		for (int i = start_results; i <= hits.length(); i++) {
		    Document doc = hits.doc(i - 1);
		    lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.score(i-1));
		}
	    }

	    // Slightly more complicated case for returning a subset of the matching documents
	    else {
		// Perform the query (filter may be null)
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
		lucene_query_result.setTotalDocs(hits.totalHits);

		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

		// Output the matching documents (1-based window, capped at
		// both the requested end and the number of docs returned)
		for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
		    Document doc = reader.document(hits.scoreDocs[i - 1].doc);
		    lucene_query_result.addDoc(doc.get("nodeID").trim(), hits.scoreDocs[i-1].score);
		}
	    }
	}

	// Map each failure mode onto the matching LuceneQueryResult error code
	catch (ParseException parse_exception) {
	    lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
	}
	catch (TooManyClauses too_many_clauses_exception) {
	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
	}
	catch (IOException exception) {
	    lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
	    exception.printStackTrace();
	}
	catch (Exception exception) {
	    lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
	    exception.printStackTrace();
	}
	return lucene_query_result;
    }
226
227 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
228 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
229 if (default_conjunction_operator.equals("AND")) {
230 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
231 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
232 } else { // default is OR
233 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
234 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
235 }
236 }
237
    /** Returns the default conjunction operator, upper-cased ("AND" or "OR"). */
    public String getDefaultConjunctionOperator() {
	return this.default_conjunction_operator;
    }

    /** Sets the 1-based index of the last result to return (inclusive). */
    public void setEndResults(int end_results) {
	this.end_results = end_results;
    }
    public int getEndResults() {
	return this.end_results;
    }

    /**
     * Stores the raw filter expression and parses it into a Lucene Filter.
     * On a parse failure the filter field is set to null (no filtering)
     * and an error is printed to stderr by parseFilterString().
     */
    public void setFilterString(String filter_string) {
	this.filter_string = filter_string;
	this.filter = parseFilterString(filter_string);
    }
    public String getFilterString() {
	return this.filter_string ;
    }

    /** Returns the parsed filter, or null when none was set or parsing failed. */
    public Filter getFilter() {
	return this.filter;
    }

    /** Sets the index directory path; must be called before initialise(). */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }

    /** Sets the fuzzy-match factor appended to query terms; null disables. */
    public void setFuzziness(String fuzziness) {
	this.fuzziness = fuzziness;
    }
    public String getFuzziness() {
	return this.fuzziness;
    }

    /** Sets the sort field; null reverts to relevance ordering. */
    public void setSortField(String sort_field) {
	this.sort_field = sort_field;
	if (sort_field == null) {
	    this.sorter = new Sort();
	} else {
	    this.sorter = new Sort(sort_field);
	}
    }
    public String getSortField() {
	return this.sort_field;
    }

    /** Sets the 1-based index of the first result to return; clamped to 1. */
    public void setStartResults(int start_results) {
	if (start_results < 1) {
	    start_results = 1;
	}
	this.start_results = start_results;
    }
    public int getStartResults() {
	return this.start_results;
    }
293
294 public void cleanUp() {
295 try {
296 if (searcher != null) {
297 searcher.close();
298 }
299 } catch (IOException exception) {
300 exception.printStackTrace();
301 }
302 }
303
    /**
     * Parses the query string into a Lucene Query, optionally appending a
     * fuzzy-match operator ("~<fuzziness>") to each plain TX:<term> clause.
     *
     * The query string is split into a prefix (the first parenthesised
     * search-term group) and a suffix (any trailing filter terms); only the
     * prefix has fuzziness applied. Phrase clauses (TX:"...") are left
     * untouched, as fuzzy matching does not apply to phrases.
     *
     * @throws java.io.IOException from Query.rewrite()
     * @throws org.apache.lucene.queryParser.ParseException on a malformed query
     */
    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	//   up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the prefix so wildcard/range terms are expanded
	// before the fuzzy pass scans the resulting string form
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    // (i.e. how much of the "TX:" field marker has been matched)
	    int s = 0; // State
	    while(o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// Insert before the terminator, then skip past it
			// (the extra o++ below this if/else chain)
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
405
406 private Filter parseFilterString(String filter_string)
407 {
408 Filter result = null;
409 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
410 Matcher matcher = pattern.matcher(filter_string);
411 if (matcher.matches()) {
412 String field_name = matcher.group(1);
413 boolean include_lower = matcher.group(2).equals("[");
414 String lower_term = matcher.group(3);
415 String upper_term = matcher.group(4);
416 boolean include_upper = matcher.group(5).equals("]");
417 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
418 }
419 else {
420 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
421 }
422 return result;
423 }
424
425
    /**
     * Flushes any buffered output on the shared UTF-8 writer before the
     * object is collected. NOTE(review): finalize() is deprecated in modern
     * Java and not guaranteed to run; callers should rely on cleanUp() and
     * explicit flushes instead — confirm before removing.
     */
    protected void finalize() throws Throwable
    {
	try {
	    utf8out.flush();
	} finally {
	    super.finalize();
	}
    }
434
435
    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    // When true, runQueryCaching() reads/writes query results in
    // <index directory>/cache keyed by the query string and options.
    static private boolean query_result_caching_enabled = false;
441
442 static public void main (String args[])
443 {
444
445
446 if (args.length == 0) {
447 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
448 return;
449 }
450
451 try {
452 String index_directory = args[0];
453
454 GS2LuceneQuery queryer = new GS2LuceneQuery();
455 queryer.setIndexDir(index_directory);
456
457 // Prepare the index cache directory, if query result caching is enabled
458 if (query_result_caching_enabled) {
459 // Make the index cache directory if it doesn't already exist
460 File index_cache_directory = new File(index_directory, "cache");
461 if (!index_cache_directory.exists()) {
462 index_cache_directory.mkdir();
463 }
464
465 // Disable caching if the index cache directory isn't available
466 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
467 query_result_caching_enabled = false;
468 }
469 }
470
471 String query_string = null;
472
473 // Parse the command-line arguments
474 for (int i = 1; i < args.length; i++) {
475 if (args[i].equals("-sort")) {
476 i++;
477 queryer.setSortField(args[i]);
478 }
479 else if (args[i].equals("-filter")) {
480 i++;
481 queryer.setFilterString(args[i]);
482 }
483 else if (args[i].equals("-dco")) {
484 i++;
485 queryer.setDefaultConjunctionOperator(args[i]);
486 }
487 else if (args[i].equals("-fuzziness")) {
488 i++;
489 queryer.setFuzziness(args[i]);
490 }
491 else if (args[i].equals("-startresults")) {
492 i++;
493 if (args[i].matches("\\d+")) {
494 queryer.setStartResults(Integer.parseInt(args[i]));
495 }
496 }
497 else if (args[i].equals("-endresults")) {
498 i++;
499 if (args[i].matches("\\d+")) {
500 queryer.setEndResults(Integer.parseInt(args[i]));
501 }
502 }
503 else {
504 query_string = args[i];
505 }
506 }
507
508 if (!queryer.initialise()) {
509 return;
510 }
511
512 // The query string has been specified as a command-line argument
513 if (query_string != null) {
514 runQueryCaching(index_directory, queryer, query_string);
515 }
516
517 // Read queries from STDIN
518 else {
519 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
520 while (true) {
521 // Read the query from STDIN
522 query_string = in.readLine();
523 if (query_string == null || query_string.length() == -1) {
524 break;
525 }
526
527 runQueryCaching(index_directory, queryer, query_string);
528
529 }
530 }
531 queryer.cleanUp();
532 }
533 catch (IOException exception) {
534 exception.printStackTrace();
535 }
536 }
537
    /**
     * Runs one query and prints its XML result set to stdout (UTF-8).
     * When caching is enabled, a previously cached result for the same
     * query + options is replayed instead (with cached="true" patched in),
     * and fresh results are written back to the cache.
     *
     * @param index_directory index path, used to locate the "cache" subdir
     * @param queryer         an initialised GS2LuceneQuery
     * @param query_string    the raw query to run
     * @throws IOException on cache-file read/write failure
     */
    private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
	throws IOException
    {
	StringBuffer query_results_xml = new StringBuffer();

	// Check if this query result has been cached from a previous search (if it's enabled)
	File query_result_cache_file = null;
	if (query_result_caching_enabled) {
	    // Generate the cache file name from the query options
	    // (query, fuzziness, filter, sort, operator and result window,
	    // joined by '-' then escaped to a filesystem-safe name)
	    String query_result_cache_file_name = query_string + "-";
	    String fuzziness = queryer.getFuzziness();
	    query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
	    String filter_string = queryer.getFilterString();
	    query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
	    String sort_string = queryer.getSortField();
	    query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
	    String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
	    query_result_cache_file_name += default_conjunction_operator + "-";
	    int start_results = queryer.getStartResults();
	    int end_results = queryer.getEndResults();
	    query_result_cache_file_name += start_results + "-" + end_results;
	    query_result_cache_file_name = fileSafe(query_result_cache_file_name);

	    // If the query result cache file exists, just return its contents and we're done
	    File index_cache_directory = new File(index_directory, "cache");
	    query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
	    if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
		FileInputStream fis = new FileInputStream(query_result_cache_file);
		InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
		BufferedReader buffered_reader = new BufferedReader(isr);
		String line = "";
		while ((line = buffered_reader.readLine()) != null) {
		    query_results_xml.append(line + "\n");
		}
		// Mark the replayed result as served from cache
		String query_results_xml_string = query_results_xml.toString();
		query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");

		utf8out.print(query_results_xml_string);
		utf8out.flush();

		return;
	    }
	}

	// not cached
	query_results_xml.append("<ResultSet cached=\"false\">\n");
	query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
	Filter filter = queryer.getFilter();
	if (filter != null) {
	    query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
	}

	LuceneQueryResult query_result = queryer.runQuery(query_string);
	if (query_result == null) {
	    System.err.println("Couldn't run the query");
	    return;
	}

	if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
	    query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
	} else {
	    query_results_xml.append(query_result.getXMLString());
	}
	query_results_xml.append("</ResultSet>\n");

	utf8out.print(query_results_xml);
	utf8out.flush();

	// NOTE(review): the block below is leftover debug code that dumps
	// every query and result to a hard-coded world-readable path
	// (/tmp/lucenequery.txt) on each call; consider removing it.
	try {
	    /*
	      Writer output = null;
	      File file = new File("/tmp/lucenequery.txt");
	      output = new BufferedWriter(new FileWriter(file,"UTF-8"));
	      output.write(query_results_xml.toString());
	      output.close();
	    */

	    FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");

	    OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");

	    osw2.write("Query string = " + query_string + "\n");
	    osw2.write(query_results_xml.toString());
	    osw2.close();
	}
	catch (Exception e) {
	    e.printStackTrace();
	}

	// Cache this query result, if desired
	// NOTE(review): FileWriter uses the platform default encoding here,
	// but the cache is read back as UTF-8 above — confirm this is safe
	// for non-ASCII results on this deployment.
	if (query_result_caching_enabled) {
	    FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
	    query_result_cache_file_writer.write(query_results_xml.toString());
	    query_result_cache_file_writer.close();
	}
    }
636
637 private static String fileSafe(String text)
638 {
639 StringBuffer file_safe_text = new StringBuffer();
640 for (int i = 0; i < text.length(); i++) {
641 char character = text.charAt(i);
642 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
643 file_safe_text.append(character);
644 }
645 else {
646 file_safe_text.append('%');
647 file_safe_text.append((int) character);
648 }
649 }
650 return file_safe_text.toString();
651 }
652
653
654}
655
656
Note: See TracBrowser for help on using the repository browser.