source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 16947

Last change on this file since 16947 was 16947, checked in by mdewsnip, 16 years ago

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space and speed efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.1 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
public class GS2LuceneQuery
{

    // Name of the Lucene field holding the full text of each document
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    // Path of the Lucene index directory to search (set via setIndexDir)
    private String full_indexdir="";
    // Boolean operator applied between query terms when none is given ("AND" or "OR")
    private String default_conjunction_operator = "OR";
    // Fuzzy-match factor appended to query terms (e.g. "0.7"), or null for exact matching
    private String fuzziness = null;
    // Field to sort results on, or null for relevance ordering; sorter mirrors it
    private String sort_field = null;
    private Sort sorter=new Sort();
    // Raw filter expression (e.g. "+CD:[2000 TO 2005]") and its parsed Lucene form
    private String filter_string = null;
    private Filter filter = null;
    // 1-based, inclusive range of result positions to return
    private int start_results=1;
    private int end_results=Integer.MAX_VALUE;

    // Parser used for normal queries (stop words removed) and a second parser
    // that keeps stop words — used only to report which stop words were dropped
    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    // All program output is written through this UTF-8 writer around stdout
    static private PrintWriter utf8out = null;

    static
    {
	try {
	    OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
	    utf8out = new PrintWriter(osw, true);
	}
	catch (UnsupportedEncodingException e) {
	    // "UTF-8" is a mandatory charset, so this should never happen
	    System.out.println(e);
	}
    }


    public GS2LuceneQuery() {

	// Create one query parser with the standard set of stop words, and one with none

	query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
	query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
126 return null;
127 }
128
129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
137 query = query.rewrite(reader);
138
139 // Get the list of expanded query terms and their frequencies
140 // num docs matching, and total frequency
141 HashSet terms = new HashSet();
142 query.extractTerms(terms);
143
144 HashMap doc_term_freq_map = new HashMap();
145
146 Iterator iter = terms.iterator();
147 while (iter.hasNext()) {
148
149 Term term = (Term) iter.next();
150
151 // Get the term frequency over all the documents
152 TermDocs term_docs = reader.termDocs(term);
153 int term_freq = 0;
154 int match_docs = 0;
155 while (term_docs.next())
156 {
157 if (term_docs.freq() != 0)
158 {
159 term_freq += term_docs.freq();
160 match_docs++;
161
162 // Calculate the document-level term frequency as well
163 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
164 int doc_term_freq = 0;
165 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
166 {
167 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
168 }
169 doc_term_freq += term_docs.freq();
170
171 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
172 }
173 }
174
175 // Create a term
176 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
177 }
178
179 // Get the list of stop words removed from the query
180 HashSet terms_including_stop_words = new HashSet();
181 query_including_stop_words.extractTerms(terms_including_stop_words);
182 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
183 while (terms_including_stop_words_iter.hasNext()) {
184 Term term = (Term) terms_including_stop_words_iter.next();
185 if (!terms.contains(term)) {
186 lucene_query_result.addStopWord(term.text());
187 }
188 }
189
190 // do the query
191 // Simple case for getting all the matching documents
192 if (end_results == Integer.MAX_VALUE) {
193 // Perform the query (filter and sorter may be null)
194 Hits hits = searcher.search(query, filter, sorter);
195 lucene_query_result.setTotalDocs(hits.length());
196
197 // Output the matching documents
198 lucene_query_result.setStartResults(start_results);
199 lucene_query_result.setEndResults(hits.length());
200
201 for (int i = start_results; i <= hits.length(); i++) {
202 int lucene_doc_num = hits.id(i - 1);
203 Document doc = hits.doc(i - 1);
204 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
205 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
206 }
207 }
208
209 // Slightly more complicated case for returning a subset of the matching documents
210 else {
211 // Perform the query (filter may be null)
212 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
213 lucene_query_result.setTotalDocs(hits.totalHits);
214
215 lucene_query_result.setStartResults(start_results);
216 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
217
218 // Output the matching documents
219 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
220 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
221 Document doc = reader.document(lucene_doc_num);
222 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
223 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
224 }
225 }
226 }
227
228 catch (ParseException parse_exception) {
229 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
230 }
231 catch (TooManyClauses too_many_clauses_exception) {
232 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
233 }
234 catch (IOException exception) {
235 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
236 exception.printStackTrace();
237 }
238 catch (Exception exception) {
239 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
240 exception.printStackTrace();
241 }
242 return lucene_query_result;
243 }
244
245 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
246 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
247 if (default_conjunction_operator.equals("AND")) {
248 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
249 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
250 } else { // default is OR
251 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
252 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
253 }
254 }
255
256 public String getDefaultConjunctionOperator() {
257 return this.default_conjunction_operator;
258 }
259
260 public void setEndResults(int end_results) {
261 this.end_results = end_results;
262 }
263 public int getEndResults() {
264 return this.end_results;
265 }
266
267 public void setFilterString(String filter_string) {
268 this.filter_string = filter_string;
269 this.filter = parseFilterString(filter_string);
270 }
271 public String getFilterString() {
272 return this.filter_string ;
273 }
274
275 public Filter getFilter() {
276 return this.filter;
277 }
278
279 public void setIndexDir(String full_indexdir) {
280 this.full_indexdir = full_indexdir;
281 }
282
283 public void setFuzziness(String fuzziness) {
284 this.fuzziness = fuzziness;
285 }
286 public String getFuzziness() {
287 return this.fuzziness;
288 }
289
290 public void setSortField(String sort_field) {
291 this.sort_field = sort_field;
292 if (sort_field == null) {
293 this.sorter = new Sort();
294 } else {
295 this.sorter = new Sort(sort_field);
296 }
297 }
298 public String getSortField() {
299 return this.sort_field;
300 }
301
302 public void setStartResults(int start_results) {
303 if (start_results < 1) {
304 start_results = 1;
305 }
306 this.start_results = start_results;
307 }
308 public int getStartResults() {
309 return this.start_results;
310 }
311
312 public void cleanUp() {
313 try {
314 if (searcher != null) {
315 searcher.close();
316 }
317 } catch (IOException exception) {
318 exception.printStackTrace();
319 }
320 }
321
    /**
     * Parses the raw query string into a Lucene Query, optionally appending a
     * fuzzy operator ("~" plus the fuzziness factor) to each plain TX:<term>
     * search term.
     *
     * The query string is first split into a prefix — the leading balanced
     * "(...)" group, which holds the search terms — and a suffix holding any
     * trailing filter terms. Only the prefix undergoes fuzzy rewriting; the
     * suffix is re-appended unchanged before the final parse.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser used for both the initial and final parse
     * @param query_string raw query expression
     * @param fuzziness    fuzzy factor to append to each term, or null for none
     * @return the parsed (and possibly fuzzy-rewritten) query
     */
    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	//   up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	// Note: if the query has no parentheses at all, this scan consumes the
	// whole string, leaving an empty suffix
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    int s = 0; // State
	    while(o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness concatenates to e.g. "~0.7"; o is
			// bumped past the inserted '~' and the loop's own o++
			// then resumes scanning inside the inserted factor,
			// which cannot start a new "TX:" match from state 0
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
423
424 private Filter parseFilterString(String filter_string)
425 {
426 Filter result = null;
427 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
428 Matcher matcher = pattern.matcher(filter_string);
429 if (matcher.matches()) {
430 String field_name = matcher.group(1);
431 boolean include_lower = matcher.group(2).equals("[");
432 String lower_term = matcher.group(3);
433 String upper_term = matcher.group(4);
434 boolean include_upper = matcher.group(5).equals("]");
435 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
436 }
437 else {
438 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
439 }
440 return result;
441 }
442
443
    /**
     * Invoked by the garbage collector: ensures any buffered query output has
     * been flushed to stdout before the object is reclaimed.
     */
    protected void finalize() throws Throwable
    {
	try {
	    utf8out.flush();
	} finally {
	    super.finalize();
	}
    }
452
453
454 /** command line program and auxiliary methods */
455
456 // Fairly self-explanatory I should hope
457 static private boolean query_result_caching_enabled = false;
458
459
460 static public void main (String args[])
461 {
462
463
464 if (args.length == 0) {
465 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
466 return;
467 }
468
469 try {
470 String index_directory = args[0];
471
472 GS2LuceneQuery queryer = new GS2LuceneQuery();
473 queryer.setIndexDir(index_directory);
474
475 // Prepare the index cache directory, if query result caching is enabled
476 if (query_result_caching_enabled) {
477 // Make the index cache directory if it doesn't already exist
478 File index_cache_directory = new File(index_directory, "cache");
479 if (!index_cache_directory.exists()) {
480 index_cache_directory.mkdir();
481 }
482
483 // Disable caching if the index cache directory isn't available
484 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
485 query_result_caching_enabled = false;
486 }
487 }
488
489 String query_string = null;
490
491 // Parse the command-line arguments
492 for (int i = 1; i < args.length; i++) {
493 if (args[i].equals("-sort")) {
494 i++;
495 queryer.setSortField(args[i]);
496 }
497 else if (args[i].equals("-filter")) {
498 i++;
499 queryer.setFilterString(args[i]);
500 }
501 else if (args[i].equals("-dco")) {
502 i++;
503 queryer.setDefaultConjunctionOperator(args[i]);
504 }
505 else if (args[i].equals("-fuzziness")) {
506 i++;
507 queryer.setFuzziness(args[i]);
508 }
509 else if (args[i].equals("-startresults")) {
510 i++;
511 if (args[i].matches("\\d+")) {
512 queryer.setStartResults(Integer.parseInt(args[i]));
513 }
514 }
515 else if (args[i].equals("-endresults")) {
516 i++;
517 if (args[i].matches("\\d+")) {
518 queryer.setEndResults(Integer.parseInt(args[i]));
519 }
520 }
521 else {
522 query_string = args[i];
523 }
524 }
525
526 if (!queryer.initialise()) {
527 return;
528 }
529
530 // The query string has been specified as a command-line argument
531 if (query_string != null) {
532 runQueryCaching(index_directory, queryer, query_string);
533 }
534
535 // Read queries from STDIN
536 else {
537 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
538 while (true) {
539 // Read the query from STDIN
540 query_string = in.readLine();
541 if (query_string == null || query_string.length() == -1) {
542 break;
543 }
544
545 runQueryCaching(index_directory, queryer, query_string);
546
547 }
548 }
549 queryer.cleanUp();
550 }
551 catch (IOException exception) {
552 exception.printStackTrace();
553 }
554 }
555
556 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
557 throws IOException
558 {
559 StringBuffer query_results_xml = new StringBuffer();
560
561 // Check if this query result has been cached from a previous search (if it's enabled)
562 File query_result_cache_file = null;
563 if (query_result_caching_enabled) {
564 // Generate the cache file name from the query options
565 String query_result_cache_file_name = query_string + "-";
566 String fuzziness = queryer.getFuzziness();
567 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
568 String filter_string = queryer.getFilterString();
569 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
570 String sort_string = queryer.getSortField();
571 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
572 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
573 query_result_cache_file_name += default_conjunction_operator + "-";
574 int start_results = queryer.getStartResults();
575 int end_results = queryer.getEndResults();
576 query_result_cache_file_name += start_results + "-" + end_results;
577 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
578
579 // If the query result cache file exists, just return its contents and we're done
580 File index_cache_directory = new File(index_directory, "cache");
581 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
582 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
583 FileInputStream fis = new FileInputStream(query_result_cache_file);
584 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
585 BufferedReader buffered_reader = new BufferedReader(isr);
586 String line = "";
587 while ((line = buffered_reader.readLine()) != null) {
588 query_results_xml.append(line + "\n");
589 }
590 String query_results_xml_string = query_results_xml.toString();
591 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
592
593 utf8out.print(query_results_xml_string);
594 utf8out.flush();
595
596 return;
597 }
598 }
599
600 // not cached
601 query_results_xml.append("<ResultSet cached=\"false\">\n");
602 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
603 Filter filter = queryer.getFilter();
604 if (filter != null) {
605 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
606 }
607
608 LuceneQueryResult query_result = queryer.runQuery(query_string);
609 if (query_result == null) {
610 System.err.println("Couldn't run the query");
611 return;
612 }
613
614 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
615 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
616 } else {
617 query_results_xml.append(query_result.getXMLString());
618 }
619 query_results_xml.append("</ResultSet>\n");
620
621 utf8out.print(query_results_xml);
622 utf8out.flush();
623
624 try {
625 /*
626 Writer output = null;
627 File file = new File("/tmp/lucenequery.txt");
628 output = new BufferedWriter(new FileWriter(file,"UTF-8"));
629 output.write(query_results_xml.toString());
630 output.close();
631 */
632
633 FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");
634
635 OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");
636
637 osw2.write("Query string = " + query_string + "\n");
638 osw2.write(query_results_xml.toString());
639 osw2.close();
640 }
641 catch (Exception e) {
642 e.printStackTrace();
643 }
644
645
646
647 // Cache this query result, if desired
648 if (query_result_caching_enabled) {
649 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
650 query_result_cache_file_writer.write(query_results_xml.toString());
651 query_result_cache_file_writer.close();
652 }
653 }
654
655 private static String fileSafe(String text)
656 {
657 StringBuffer file_safe_text = new StringBuffer();
658 for (int i = 0; i < text.length(); i++) {
659 char character = text.charAt(i);
660 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
661 file_safe_text.append(character);
662 }
663 else {
664 file_safe_text.append('%');
665 file_safe_text.append((int) character);
666 }
667 }
668 return file_safe_text.toString();
669 }
670
671
672}
673
674
Note: See TracBrowser for help on using the repository browser.