source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 27068

Last change on this file since 27068 was 27068, checked in by kjdon, 11 years ago

adding in reverse sort option

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
public class GS2LuceneQuery
{

    // Default Lucene field searched when a query term carries no explicit
    // field prefix (Greenstone's full-text field)
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = GS2Analyzer.STOP_WORDS;

    // Full path of the Lucene index directory to search
    private String full_indexdir = "";
    // Operator inserted between query terms when none is given ("AND"/"OR")
    private String default_conjunction_operator = "OR";
    // Fuzziness value appended to terms as "term~value"; null = exact match
    private String fuzziness = null;
    // Field to sort results by; null = relevance order
    private String sort_field = null;
    // When true, results for sort_field are returned in reverse order
    private boolean reverse_sort = false;
    // Lucene sorter built from sort_field/reverse_sort (default: relevance)
    private Sort sorter = new Sort();
    // Raw range-filter string and its parsed Lucene form (both null = no filter)
    private String filter_string = null;
    private Filter filter = null;
    // 1-based inclusive window of result ranks to return
    private int start_results = 1;
    private int end_results = Integer.MAX_VALUE;

    // Two parsers over TEXTFIELD: one removes the standard stop words, the
    // other keeps everything (used to report which stop words were dropped)
    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    // Shared UTF-8 writer wrapping stdout for all user-visible output
    static private PrintWriter utf8out = null;

    static
    {
	try {
	    OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
	    utf8out = new PrintWriter(osw, true);
	}
	catch (UnsupportedEncodingException e) {
	    System.out.println(e);
	}
    }
91
92
93 public GS2LuceneQuery() {
94
95 // Create one query parser with the standard set of stop words, and one with none
96
97 query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
98 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
99 }
100
101
102 public boolean initialise() {
103
104 if (full_indexdir==null || full_indexdir.length()==-1){
105 utf8out.println("Index directory is not indicated ");
106 utf8out.flush();
107 return false;
108 }
109 try {
110 searcher = new IndexSearcher(full_indexdir);
111 reader = ((IndexSearcher) searcher).getIndexReader();
112
113 }
114 catch (IOException exception) {
115 exception.printStackTrace();
116 return false;
117 }
118 return true;
119
120 }
121
    /**
     * Runs the given query string against the open index and collects the
     * matching documents, expanded query terms (with frequencies) and any
     * stop words removed from the query into a LuceneQueryResult.
     *
     * Requires initialise() to have been called first (uses searcher/reader).
     *
     * @param query_string the raw query; must be non-null and non-empty
     * @return the populated result, with an error code set on failure, or
     *         null when the query string was missing
     */
    public LuceneQueryResult runQuery(String query_string) {

	if (query_string == null || query_string.equals("")) {
	    utf8out.println("The query word is not indicated ");
	    utf8out.flush();
	    return null;
	}

	LuceneQueryResult lucene_query_result = new LuceneQueryResult();
	lucene_query_result.clear();

	try {
	    // Parse with the no-stop-word parser too, so we can later diff the
	    // two term sets to report which stop words were removed
	    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
	    System.err.println("stop query class="+query_including_stop_words.getClass().getName());
	    query_including_stop_words = query_including_stop_words.rewrite(reader);
	    System.err.println("stop query class="+query_including_stop_words.getClass().getName());
	    System.err.println("********* query_string " + query_string + "****");

	    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
	    query = query.rewrite(reader);

	    // Get the list of expanded query terms and their frequencies
	    // num docs matching, and total frequency
	    HashSet terms = new HashSet();
	    query.extractTerms(terms);

	    // Maps Integer(lucene doc number) -> Integer(total term freq in that doc)
	    HashMap doc_term_freq_map = new HashMap();

	    Iterator iter = terms.iterator();
	    while (iter.hasNext()) {

		Term term = (Term) iter.next();

		// Get the term frequency over all the documents
		TermDocs term_docs = reader.termDocs(term);
		int term_freq = 0;
		int match_docs = 0;
		while (term_docs.next())
		{
		    if (term_docs.freq() != 0)
		    {
			term_freq += term_docs.freq();
			match_docs++;

			// Calculate the document-level term frequency as well
			Integer lucene_doc_num_obj = new Integer(term_docs.doc());
			int doc_term_freq = 0;
			if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
			{
			    doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
			}
			doc_term_freq += term_docs.freq();

			doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
		    }
		}

		// Create a term
		lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
	    }

	    // Get the list of stop words removed from the query:
	    // any term present only in the "with stop words" parse was filtered
	    // out by the stop-word analyzer
	    HashSet terms_including_stop_words = new HashSet();
	    query_including_stop_words.extractTerms(terms_including_stop_words);
	    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
	    while (terms_including_stop_words_iter.hasNext()) {
		Term term = (Term) terms_including_stop_words_iter.next();
		if (!terms.contains(term)) {
		    lucene_query_result.addStopWord(term.text());
		}
	    }

	    // do the query
	    // Simple case for getting all the matching documents
	    if (end_results == Integer.MAX_VALUE) {
		// Perform the query (filter and sorter may be null)
		Hits hits = searcher.search(query, filter, sorter);
		lucene_query_result.setTotalDocs(hits.length());

		// Output the matching documents
		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(hits.length());

		// start_results is 1-based, Hits indices are 0-based,
		// hence the i - 1 below
		for (int i = start_results; i <= hits.length(); i++) {
		    int lucene_doc_num = hits.id(i - 1);
		    Document doc = hits.doc(i - 1);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    // "docOID" is the stored Greenstone document identifier
		    // field — presumably unique per document; verify in indexer
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
		}
	    }

	    // Slightly more complicated case for returning a subset of the matching documents
	    else {
		// Perform the query (filter may be null); only the top
		// end_results documents are fetched
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
		lucene_query_result.setTotalDocs(hits.totalHits);

		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results : hits.scoreDocs.length);

		// Output the matching documents
		for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
		    int lucene_doc_num = hits.scoreDocs[i - 1].doc;
		    Document doc = reader.document(lucene_doc_num);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
		}
	    }
	}

	// Errors are reported to the caller via the result's error code
	catch (ParseException parse_exception) {
	    lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
	}
	catch (TooManyClauses too_many_clauses_exception) {
	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
	}
	catch (IOException exception) {
	    lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
	    exception.printStackTrace();
	}
	catch (Exception exception) {
	    lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
	    exception.printStackTrace();
	}
	return lucene_query_result;
    }
258
259 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
260 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
261 if (default_conjunction_operator.equals("AND")) {
262 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
263 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
264 } else { // default is OR
265 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
266 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
267 }
268 }
269
270 public String getDefaultConjunctionOperator() {
271 return this.default_conjunction_operator;
272 }
273
    /** Sets the 1-based rank of the last result to return (inclusive). */
    public void setEndResults(int end_results) {
	this.end_results = end_results;
    }
    /** @return the 1-based rank of the last result to return */
    public int getEndResults() {
	return this.end_results;
    }

    /**
     * Sets the range filter from a string of the form "+FIELD:[lower TO upper]"
     * (see parseFilterString); both the raw string and the parsed Filter are
     * kept so the string can be echoed back and used in cache-file names.
     */
    public void setFilterString(String filter_string) {
	this.filter_string = filter_string;
	this.filter = parseFilterString(filter_string);
    }
    /** @return the raw filter string, or null when no filter is set */
    public String getFilterString() {
	return this.filter_string;
    }

    /** @return the parsed Lucene filter, or null (no filter, or parse failed) */
    public Filter getFilter() {
	return this.filter;
    }

    /** Sets the full path of the Lucene index directory to search. */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }

    /**
     * Sets the fuzziness value appended to query terms (producing
     * "term~value" clauses); null disables fuzzy searching.
     */
    public void setFuzziness(String fuzziness) {
	this.fuzziness = fuzziness;
    }
    /** @return the fuzziness value, or null when fuzzy search is disabled */
    public String getFuzziness() {
	return this.fuzziness;
    }
303
304 public void setSortField(String sort_field) {
305 this.sort_field = sort_field;
306 if (sort_field == null) {
307 this.sorter = new Sort();
308 } else {
309 this.sorter = new Sort(sort_field, this.reverse_sort);
310 }
311 }
312 public String getSortField() {
313 return this.sort_field;
314 }
315 public void setReverseSort() {
316 this.reverse_sort = true;
317 if (this.sort_field != null) {
318 this.sorter = new Sort(this.sort_field, this.reverse_sort);
319 }
320 }
321 public boolean getReverseSort() {
322 return this.reverse_sort;
323 }
324 public void setStartResults(int start_results) {
325 if (start_results < 1) {
326 start_results = 1;
327 }
328 this.start_results = start_results;
329 }
330 public int getStartResults() {
331 return this.start_results;
332 }
333
334 public void cleanUp() {
335 try {
336 if (searcher != null) {
337 searcher.close();
338 }
339 } catch (IOException exception) {
340 exception.printStackTrace();
341 }
342 }
343
    /**
     * Parses a raw Greenstone query string into a Lucene Query, optionally
     * appending a fuzziness operator ("~value") to each plain TX:<term>
     * clause. Phrase clauses (TX:"...") are left untouched since Lucene does
     * not support fuzzy phrase matching.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser configured with the desired analyzer/operator
     * @param query_string raw query; the leading parenthesised group holds the
     *                     search terms, anything after it is filter terms
     * @param fuzziness    fuzzy value to append to terms, or null for exact
     */
    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the search-term portion only; the suffix is
	// re-appended below before the final parse
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    int s = 0; // State
	    while (o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		// States 0-2 match the literal characters of TEXTFIELD ("TX")
		// followed by ':'; state 3 scans for the end of the term
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// NOTE: '~' + fuzziness is char-to-String concatenation,
			// inserting e.g. "~0.7" before the terminator
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
445
446 private Filter parseFilterString(String filter_string)
447 {
448 Filter result = null;
449 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
450 Matcher matcher = pattern.matcher(filter_string);
451 if (matcher.matches()) {
452 String field_name = matcher.group(1);
453 boolean include_lower = matcher.group(2).equals("[");
454 String lower_term = matcher.group(3);
455 String upper_term = matcher.group(4);
456 boolean include_upper = matcher.group(5).equals("]");
457 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
458 }
459 else {
460 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
461 }
462 return result;
463 }
464
465
    /**
     * Flushes any buffered UTF-8 output before the object is reclaimed.
     * NOTE(review): finalizers are deprecated and not guaranteed to run;
     * callers should rely on cleanUp() and explicit flushing instead.
     */
    protected void finalize() throws Throwable
    {
	try {
	    utf8out.flush();
	} finally {
	    super.finalize();
	}
    }
474
475
    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    // When true, query result XML is written to / replayed from a "cache"
    // directory inside the index directory; see runQueryCaching()
    static private boolean query_result_caching_enabled = false;
481
482 static public void main (String args[])
483 {
484
485
486 if (args.length == 0) {
487 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort] [-dco AND|OR] [-startresults number -endresults number] [query]");
488 return;
489 }
490
491 try {
492 String index_directory = args[0];
493
494 GS2LuceneQuery queryer = new GS2LuceneQuery();
495 queryer.setIndexDir(index_directory);
496
497 // Prepare the index cache directory, if query result caching is enabled
498 if (query_result_caching_enabled) {
499 // Make the index cache directory if it doesn't already exist
500 File index_cache_directory = new File(index_directory, "cache");
501 if (!index_cache_directory.exists()) {
502 index_cache_directory.mkdir();
503 }
504
505 // Disable caching if the index cache directory isn't available
506 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
507 query_result_caching_enabled = false;
508 }
509 }
510
511 String query_string = null;
512
513 // Parse the command-line arguments
514 for (int i = 1; i < args.length; i++) {
515 if (args[i].equals("-sort")) {
516 i++;
517 queryer.setSortField(args[i]);
518 }
519 else if (args[i].equals("-reverse_sort")) {
520 queryer.setReverseSort();
521 }
522 else if (args[i].equals("-filter")) {
523 i++;
524 queryer.setFilterString(args[i]);
525 }
526 else if (args[i].equals("-dco")) {
527 i++;
528 queryer.setDefaultConjunctionOperator(args[i]);
529 }
530 else if (args[i].equals("-fuzziness")) {
531 i++;
532 queryer.setFuzziness(args[i]);
533 }
534 else if (args[i].equals("-startresults")) {
535 i++;
536 if (args[i].matches("\\d+")) {
537 queryer.setStartResults(Integer.parseInt(args[i]));
538 }
539 }
540 else if (args[i].equals("-endresults")) {
541 i++;
542 if (args[i].matches("\\d+")) {
543 queryer.setEndResults(Integer.parseInt(args[i]));
544 }
545 }
546 else {
547 query_string = args[i];
548 }
549 }
550
551 if (!queryer.initialise()) {
552 return;
553 }
554
555 // The query string has been specified as a command-line argument
556 if (query_string != null) {
557 runQueryCaching(index_directory, queryer, query_string);
558 }
559
560 // Read queries from STDIN
561 else {
562 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
563 while (true) {
564 // Read the query from STDIN
565 query_string = in.readLine();
566 if (query_string == null || query_string.length() == -1) {
567 break;
568 }
569
570 runQueryCaching(index_directory, queryer, query_string);
571
572 }
573 }
574 queryer.cleanUp();
575 }
576 catch (IOException exception) {
577 exception.printStackTrace();
578 }
579 }
580
581 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
582 throws IOException
583 {
584 StringBuffer query_results_xml = new StringBuffer();
585
586 // Check if this query result has been cached from a previous search (if it's enabled)
587 File query_result_cache_file = null;
588 if (query_result_caching_enabled) {
589 // Generate the cache file name from the query options
590 String query_result_cache_file_name = query_string + "-";
591 String fuzziness = queryer.getFuzziness();
592 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
593 String filter_string = queryer.getFilterString();
594 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
595 String sort_string = queryer.getSortField();
596 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
597 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
598 query_result_cache_file_name += reverse_sort_string + "-";
599 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
600 query_result_cache_file_name += default_conjunction_operator + "-";
601 int start_results = queryer.getStartResults();
602 int end_results = queryer.getEndResults();
603 query_result_cache_file_name += start_results + "-" + end_results;
604 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
605
606 // If the query result cache file exists, just return its contents and we're done
607 File index_cache_directory = new File(index_directory, "cache");
608 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
609 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
610 FileInputStream fis = new FileInputStream(query_result_cache_file);
611 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
612 BufferedReader buffered_reader = new BufferedReader(isr);
613 String line = "";
614 while ((line = buffered_reader.readLine()) != null) {
615 query_results_xml.append(line + "\n");
616 }
617 String query_results_xml_string = query_results_xml.toString();
618 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
619
620 utf8out.print(query_results_xml_string);
621 utf8out.flush();
622
623 return;
624 }
625 }
626
627 // not cached
628 query_results_xml.append("<ResultSet cached=\"false\">\n");
629 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
630 Filter filter = queryer.getFilter();
631 if (filter != null) {
632 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
633 }
634
635 LuceneQueryResult query_result = queryer.runQuery(query_string);
636 if (query_result == null) {
637 System.err.println("Couldn't run the query");
638 return;
639 }
640
641 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
642 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
643 } else {
644 query_results_xml.append(query_result.getXMLString());
645 }
646 query_results_xml.append("</ResultSet>\n");
647
648 utf8out.print(query_results_xml);
649 utf8out.flush();
650
651 // Cache this query result, if desired
652 if (query_result_caching_enabled) {
653 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
654 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
655 // files, it will just affect the speed of subsequent requests.
656 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
657 // can get very long in some collections)
658 try
659 {
660 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
661 query_result_cache_file_writer.write(query_results_xml.toString());
662 query_result_cache_file_writer.close();
663 }
664 catch (Exception exception)
665 {
666 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
667 }
668 }
669 }
670
671 private static String fileSafe(String text)
672 {
673 StringBuffer file_safe_text = new StringBuffer();
674 for (int i = 0; i < text.length(); i++) {
675 char character = text.charAt(i);
676 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
677 file_safe_text.append(character);
678 }
679 else {
680 file_safe_text.append('%');
681 file_safe_text.append((int) character);
682 }
683 }
684 return file_safe_text.toString();
685 }
686
687
688}
689
690
Note: See TracBrowser for help on using the repository browser.