source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java@ 24732

Last change on this file since 24732 was 24732, checked in by davidb, 13 years ago

Some additional changes that allow both Lucene 2.x and 3.x to be compiled up side-by-side

File size: 20.4 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper3;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.IndexSearcher;
44import org.apache.lucene.search.Query;
45import org.apache.lucene.search.TermRangeFilter;
46import org.apache.lucene.search.Searcher;
47import org.apache.lucene.search.ScoreDoc;
48import org.apache.lucene.search.Sort;
49import org.apache.lucene.search.SortField;
50import org.apache.lucene.search.TopFieldDocs;
51
52import org.apache.lucene.store.Directory;
53import org.apache.lucene.store.FSDirectory;
54import org.apache.lucene.util.Version;
55
56public class GS2LuceneQuery extends SharedSoleneQuery
57{
58 protected String full_indexdir="";
59
60 protected Sort sorter=new Sort();
61 protected Filter filter = null;
62
63 protected static Version matchVersion = Version.LUCENE_24;
64
65 protected QueryParser query_parser = null;
66 protected QueryParser query_parser_no_stop_words = null;
67 protected Searcher searcher = null;
68 protected IndexReader reader = null;
69
70 public GS2LuceneQuery() {
71 super();
72
73 // Create one query parser with the standard set of stop words, and one with none
74
75 query_parser = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
76 query_parser_no_stop_words = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer(new String[] { }));
77 }
78
79
80 public boolean initialise() {
81
82 if (!super.initialise()) {
83 return false;
84 }
85
86
87 if (full_indexdir==null || full_indexdir.length()==-1){
88 utf8out.println("Index directory is not indicated ");
89 utf8out.flush();
90 return false;
91 }
92
93 try {
94 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
95 searcher = new IndexSearcher(full_indexdir_dir,true);
96 reader = ((IndexSearcher) searcher).getIndexReader();
97
98 }
99 catch (IOException exception) {
100 exception.printStackTrace();
101 return false;
102 }
103 return true;
104
105 }
106
107 public void setIndexDir(String full_indexdir) {
108 this.full_indexdir = full_indexdir;
109 }
110
111 public void setSortField(String sort_field) {
112 super.setSortField(sort_field);
113
114 if (sort_field == null) {
115 this.sorter = new Sort();
116 } else {
117 this.sorter = new Sort(new SortField(sort_field,SortField.STRING)); // **** can do better than this?!?
118 }
119 }
120
121 public void setFilterString(String filter_string) {
122 super.setFilterString(filter_string);
123 this.filter = parseFilterString(filter_string);
124 }
125
126 public Filter getFilter() {
127 return this.filter;
128 }
129
130
131 public LuceneQueryResult runQuery(String query_string) {
132
133 if (query_string == null || query_string.equals("")) {
134 utf8out.println("The query word is not indicated ");
135 utf8out.flush();
136 return null;
137 }
138
139 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
140 lucene_query_result.clear();
141
142 try {
143 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
144 query_including_stop_words = query_including_stop_words.rewrite(reader);
145
146 // System.err.println("********* query_string " + query_string + "****");
147
148 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
149 query = query.rewrite(reader);
150
151 // Get the list of expanded query terms and their frequencies
152 // num docs matching, and total frequency
153 HashSet terms = new HashSet();
154 query.extractTerms(terms);
155
156 HashMap doc_term_freq_map = new HashMap();
157
158 Iterator iter = terms.iterator();
159 while (iter.hasNext()) {
160
161 Term term = (Term) iter.next();
162
163 // Get the term frequency over all the documents
164 TermDocs term_docs = reader.termDocs(term);
165 int term_freq = 0;
166 int match_docs = 0;
167 while (term_docs.next())
168 {
169 if (term_docs.freq() != 0)
170 {
171 term_freq += term_docs.freq();
172 match_docs++;
173
174 // Calculate the document-level term frequency as well
175 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
176 int doc_term_freq = 0;
177 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
178 {
179 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
180 }
181 doc_term_freq += term_docs.freq();
182
183 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
184 }
185 }
186
187 // Create a term
188 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
189 }
190
191 // Get the list of stop words removed from the query
192 HashSet terms_including_stop_words = new HashSet();
193 query_including_stop_words.extractTerms(terms_including_stop_words);
194 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
195 while (terms_including_stop_words_iter.hasNext()) {
196 Term term = (Term) terms_including_stop_words_iter.next();
197 if (!terms.contains(term)) {
198 lucene_query_result.addStopWord(term.text());
199 }
200 }
201
202 // do the query
203 // Simple case for getting all the matching documents
204 if (end_results == Integer.MAX_VALUE) {
205 // Perform the query (filter and sorter may be null)
206 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
207 lucene_query_result.setTotalDocs(hits.totalHits);
208
209 // Output the matching documents
210 lucene_query_result.setStartResults(start_results);
211 lucene_query_result.setEndResults(hits.totalHits);
212
213 for (int i = start_results; i <= hits.totalHits; i++) {
214 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
215 Document doc = reader.document(lucene_doc_num);
216 int doc_term_freq = 0;
217 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
218 if (doc_term_freq_object != null)
219 {
220 doc_term_freq = doc_term_freq_object.intValue();
221 }
222 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
223 }
224 }
225
226 // Slightly more complicated case for returning a subset of the matching documents
227 else {
228 // Perform the query (filter may be null)
229 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
230 lucene_query_result.setTotalDocs(hits.totalHits);
231
232 lucene_query_result.setStartResults(start_results);
233 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
234
235 // Output the matching documents
236 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
237 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
238 Document doc = reader.document(lucene_doc_num);
239 int doc_term_freq = 0;
240 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
241 if (doc_term_freq_object != null)
242 {
243 doc_term_freq = doc_term_freq_object.intValue();
244 }
245 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
246 }
247 }
248 }
249
250 catch (ParseException parse_exception) {
251 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
252 }
253 catch (TooManyClauses too_many_clauses_exception) {
254 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
255 }
256 catch (IOException exception) {
257 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
258 exception.printStackTrace();
259 }
260 catch (Exception exception) {
261 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
262 exception.printStackTrace();
263 }
264 return lucene_query_result;
265 }
266
267 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
268 super.setDefaultConjunctionOperator(default_conjunction_operator);
269
270 if (default_conjunction_operator.equals("AND")) {
271 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
272 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
273 } else { // default is OR
274 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
275 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
276 }
277 }
278
279
280 public void cleanUp() {
281 super.cleanUp();
282 try {
283 if (searcher != null) {
284 searcher.close();
285 }
286 } catch (IOException exception) {
287 exception.printStackTrace();
288 }
289 }
290
291
292 protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
293 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
294 {
295 // Split query string into the search terms and the filter terms
296 // * The first +(...) term contains the search terms so count
297 // up '(' and stop when we finish matching ')'
298 int offset = 0;
299 int paren_count = 0;
300 boolean seen_paren = false;
301 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
302 if (query_string.charAt(offset) == '(') {
303 paren_count++;
304 seen_paren = true;
305 }
306 if (query_string.charAt(offset) == ')') {
307 paren_count--;
308 }
309 offset++;
310 }
311 String query_prefix = query_string.substring(0, offset);
312 String query_suffix = query_string.substring(offset);
313
314 ///ystem.err.println("Prefix: " + query_prefix);
315 ///ystem.err.println("Suffix: " + query_suffix);
316
317 Query query = query_parser.parse(query_prefix);
318 query = query.rewrite(reader);
319
320 // If this is a fuzzy search, then we need to add the fuzzy
321 // flag to each of the query terms
322 if (fuzziness != null && query.toString().length() > 0) {
323
324 // Revert the query to a string
325 System.err.println("Rewritten query: " + query.toString());
326 // Search through the string for TX:<term> query terms
327 // and append the ~ operator. Note that this search will
328 // not change phrase searches (TX:"<term> <term>") as
329 // fuzzy searching is not possible for these entries.
330 // Yahoo! Time for a state machine!
331 StringBuffer mutable_query_string = new StringBuffer(query.toString());
332 int o = 0; // Offset
333 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
334 int s = 0; // State
335 while(o < mutable_query_string.length()) {
336 char c = mutable_query_string.charAt(o);
337 if (s == 0 && c == TEXTFIELD.charAt(0)) {
338 ///ystem.err.println("Found T!");
339 s = 1;
340 }
341 else if (s == 1) {
342 if (c == TEXTFIELD.charAt(1)) {
343 ///ystem.err.println("Found X!");
344 s = 2;
345 }
346 else {
347 s = 0; // Reset
348 }
349 }
350 else if (s == 2) {
351 if (c == ':') {
352 ///ystem.err.println("Found TX:!");
353 s = 3;
354 }
355 else {
356 s = 0; // Reset
357 }
358 }
359 else if (s == 3) {
360 // Don't process phrases
361 if (c == '"') {
362 ///ystem.err.println("Stupid phrase...");
363 s = 0; // Reset
364 }
365 // Found the end of the term... add the
366 // fuzzy search indicator
367 // Nor outside the scope of parentheses
368 else if (Character.isWhitespace(c) || c == ')') {
369 ///ystem.err.println("Yahoo! Found fuzzy term.");
370 mutable_query_string.insert(o, '~' + fuzziness);
371 o++;
372 s = 0; // Reset
373 }
374 }
375 o++;
376 }
377 // If we were in the state of looking for the end of a
378 // term - then we just found it!
379 if (s == 3) {
380
381 mutable_query_string.append('~' + fuzziness);
382 }
383 // Reparse the query
384 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
385 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
386 }
387 else {
388 query = query_parser.parse(query_prefix + query_suffix);
389 }
390
391 return query;
392 }
393
394 protected Filter parseFilterString(String filter_string)
395 {
396 Filter result = null;
397 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
398 Matcher matcher = pattern.matcher(filter_string);
399 if (matcher.matches()) {
400 String field_name = matcher.group(1);
401 boolean include_lower = matcher.group(2).equals("[");
402 String lower_term = matcher.group(3);
403 String upper_term = matcher.group(4);
404 boolean include_upper = matcher.group(5).equals("]");
405 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
406 }
407 else {
408 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
409 }
410 return result;
411 }
412
413
414 /** command line program and auxiliary methods */
415
416 // Fairly self-explanatory I should hope
417 static protected boolean query_result_caching_enabled = false;
418
419
420 static public void main (String args[])
421 {
422 if (args.length == 0) {
423 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
424 return;
425 }
426
427 try {
428 String index_directory = args[0];
429
430 GS2LuceneQuery queryer = new GS2LuceneQuery();
431 queryer.setIndexDir(index_directory);
432
433 // Prepare the index cache directory, if query result caching is enabled
434 if (query_result_caching_enabled) {
435 // Make the index cache directory if it doesn't already exist
436 File index_cache_directory = new File(index_directory, "cache");
437 if (!index_cache_directory.exists()) {
438 index_cache_directory.mkdir();
439 }
440
441 // Disable caching if the index cache directory isn't available
442 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
443 query_result_caching_enabled = false;
444 }
445 }
446
447 String query_string = null;
448
449 // Parse the command-line arguments
450 for (int i = 1; i < args.length; i++) {
451 if (args[i].equals("-sort")) {
452 i++;
453 queryer.setSortField(args[i]);
454 }
455 else if (args[i].equals("-filter")) {
456 i++;
457 queryer.setFilterString(args[i]);
458 }
459 else if (args[i].equals("-dco")) {
460 i++;
461 queryer.setDefaultConjunctionOperator(args[i]);
462 }
463 else if (args[i].equals("-fuzziness")) {
464 i++;
465 queryer.setFuzziness(args[i]);
466 }
467 else if (args[i].equals("-startresults")) {
468 i++;
469 if (args[i].matches("\\d+")) {
470 queryer.setStartResults(Integer.parseInt(args[i]));
471 }
472 }
473 else if (args[i].equals("-endresults")) {
474 i++;
475 if (args[i].matches("\\d+")) {
476 queryer.setEndResults(Integer.parseInt(args[i]));
477 }
478 }
479 else {
480 query_string = args[i];
481 }
482 }
483
484 if (!queryer.initialise()) {
485 return;
486 }
487
488 // The query string has been specified as a command-line argument
489 if (query_string != null) {
490 runQueryCaching(index_directory, queryer, query_string);
491 }
492
493 // Read queries from STDIN
494 else {
495 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
496 while (true) {
497 // Read the query from STDIN
498 query_string = in.readLine();
499 if (query_string == null || query_string.length() == -1) {
500 break;
501 }
502
503 runQueryCaching(index_directory, queryer, query_string);
504
505 }
506 }
507 queryer.cleanUp();
508 }
509 catch (IOException exception) {
510 exception.printStackTrace();
511 }
512 }
513
514 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
515 throws IOException
516 {
517 StringBuffer query_results_xml = new StringBuffer();
518
519 // Check if this query result has been cached from a previous search (if it's enabled)
520 File query_result_cache_file = null;
521 if (query_result_caching_enabled) {
522 // Generate the cache file name from the query options
523 String query_result_cache_file_name = query_string + "-";
524 String fuzziness = queryer.getFuzziness();
525 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
526 String filter_string = queryer.getFilterString();
527 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
528 String sort_string = queryer.getSortField();
529 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
530 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
531 query_result_cache_file_name += default_conjunction_operator + "-";
532 int start_results = queryer.getStartResults();
533 int end_results = queryer.getEndResults();
534 query_result_cache_file_name += start_results + "-" + end_results;
535 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
536
537 // If the query result cache file exists, just return its contents and we're done
538 File index_cache_directory = new File(index_directory, "cache");
539 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
540 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
541 FileInputStream fis = new FileInputStream(query_result_cache_file);
542 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
543 BufferedReader buffered_reader = new BufferedReader(isr);
544 String line = "";
545 while ((line = buffered_reader.readLine()) != null) {
546 query_results_xml.append(line + "\n");
547 }
548 String query_results_xml_string = query_results_xml.toString();
549 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
550
551 utf8out.print(query_results_xml_string);
552 utf8out.flush();
553
554 return;
555 }
556 }
557
558 // not cached
559 query_results_xml.append("<ResultSet cached=\"false\">\n");
560 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
561 Filter filter = queryer.getFilter();
562 if (filter != null) {
563 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
564 }
565
566 LuceneQueryResult query_result = queryer.runQuery(query_string);
567 if (query_result == null) {
568 System.err.println("Couldn't run the query");
569 return;
570 }
571
572 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
573 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
574 } else {
575 query_results_xml.append(query_result.getXMLString());
576 }
577 query_results_xml.append("</ResultSet>\n");
578
579 utf8out.print(query_results_xml);
580 utf8out.flush();
581
582 // Cache this query result, if desired
583 if (query_result_caching_enabled) {
584 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
585 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
586 // files, it will just affect the speed of subsequent requests.
587 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
588 // can get very long in some collections)
589 try
590 {
591 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
592 query_result_cache_file_writer.write(query_results_xml.toString());
593 query_result_cache_file_writer.close();
594 }
595 catch (Exception exception)
596 {
597 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
598 }
599 }
600 }
601
602 protected static String fileSafe(String text)
603 {
604 StringBuffer file_safe_text = new StringBuffer();
605 for (int i = 0; i < text.length(); i++) {
606 char character = text.charAt(i);
607 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
608 file_safe_text.append(character);
609 }
610 else {
611 file_safe_text.append('%');
612 file_safe_text.append((int) character);
613 }
614 }
615 return file_safe_text.toString();
616 }
617
618
619}
620
621
Note: See TracBrowser for help on using the repository browser.