source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 13557

Last change on this file since 13557 was 13557, checked in by kjdon, 17 years ago

in GS3 we will use GS2LuceneQuery directly, so moved most of the functionality out of the static methods and into instance methods. main() is now pretty much just a wrapper around the class. Caching has been left in the command line version for now - maybe should be in the class version too??

  • Property svn:keywords set to Author Date Id Revision
File size: 19.1 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.nzdl.gsdl.LuceneWrap;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
54public class GS2LuceneQuery
55{
56
57
58 static private String TEXTFIELD = "TX";
59
60 // Use the standard set of English stop words by default
61 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
62
63 private String full_indexdir="";
64 private String default_conjunction_operator = "OR";
65 private String fuzziness = null;
66 private String sort_field = null;
67 private Sort sorter=new Sort();
68 private String filter_string = null;
69 private Filter filter = null;
70 private int start_results=1;
71 private int end_results=Integer.MAX_VALUE;
72
73 private QueryParser query_parser = null;
74 private QueryParser query_parser_no_stop_words = null;
75 private Searcher searcher = null;
76 private IndexReader reader = null;
77
78 public GS2LuceneQuery() {
79
80 // Create one query parser with the standard set of stop words, and one with none
81
82 query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
83 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
84 }
85
86
87 public boolean initialise() {
88
89 if (full_indexdir==null || full_indexdir.length()==-1){
90 System.out.println("Index directory is not indicated ");
91 return false;
92 }
93 try {
94 searcher = new IndexSearcher(full_indexdir);
95 reader = ((IndexSearcher) searcher).getIndexReader();
96
97 }
98 catch (IOException exception) {
99 exception.printStackTrace();
100 return false;
101 }
102 return true;
103
104 }
105
106 public LuceneQueryResult runQuery(String query_string) {
107
108 if (query_string == null || query_string.equals("")) {
109 System.out.println("The query word is not indicated ");
110 return null;
111 }
112
113 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
114 lucene_query_result.clear();
115
116 try {
117 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
118 query_including_stop_words = query_including_stop_words.rewrite(reader);
119
120 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
121 query = query.rewrite(reader);
122
123 // Get the list of expanded query terms and their frequencies
124 // num docs matching, and total frequency
125 HashSet terms = new HashSet();
126 query.extractTerms(terms);
127
128 Iterator iter = terms.iterator();
129 while (iter.hasNext()) {
130
131 Term term = (Term) iter.next();
132
133 // Get the term frequency over all the documents
134 TermDocs term_docs = reader.termDocs(term);
135 int term_freq = term_docs.freq();
136 int match_docs = 0;
137 if (term_freq != 0) match_docs++;
138 while (term_docs.next()) {
139 term_freq += term_docs.freq();
140 if (term_docs.freq()!= 0) {
141 match_docs++;
142 }
143 }
144
145 // Create a term
146 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
147 }
148
149 // Get the list of stop words removed from the query
150 HashSet terms_including_stop_words = new HashSet();
151 query_including_stop_words.extractTerms(terms_including_stop_words);
152 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
153 while (terms_including_stop_words_iter.hasNext()) {
154 Term term = (Term) terms_including_stop_words_iter.next();
155 if (!terms.contains(term)) {
156 lucene_query_result.addStopWord(term.text());
157 }
158 }
159
160 // do the query
161 // Simple case for getting all the matching documents
162 if (end_results == Integer.MAX_VALUE) {
163 // Perform the query (filter and sorter may be null)
164 Hits hits = searcher.search(query, filter, sorter);
165 lucene_query_result.setTotalDocs(hits.length());
166
167 // Output the matching documents
168 lucene_query_result.setStartResults(start_results);
169 lucene_query_result.setEndResults(hits.length());
170
171 for (int i = start_results; i <= hits.length(); i++) {
172 Document doc = hits.doc(i - 1);
173 lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.score(i-1));
174 }
175 }
176
177 // Slightly more complicated case for returning a subset of the matching documents
178 else {
179 // Perform the query (filter may be null)
180 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
181 lucene_query_result.setTotalDocs(hits.totalHits);
182
183 lucene_query_result.setStartResults(start_results);
184 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
185
186 // Output the matching documents
187 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
188 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
189 lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.scoreDocs[i-1].score);
190 }
191 }
192 }
193
194 catch (ParseException parse_exception) {
195 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
196 }
197 catch (TooManyClauses too_many_clauses_exception) {
198 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
199 }
200 catch (IOException exception) {
201 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
202 exception.printStackTrace();
203 }
204
205 return lucene_query_result;
206 }
207
208 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
209 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
210 if (default_conjunction_operator == "AND") {
211 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
212 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
213 } else { // default is OR
214 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
215 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
216 }
217 }
218
219 public String getDefaultConjunctionOperator() {
220 return this.default_conjunction_operator;
221 }
222
223 public void setEndResults(int end_results) {
224 this.end_results = end_results;
225 }
226 public int getEndResults() {
227 return this.end_results;
228 }
229
230 public void setFilterString(String filter_string) {
231 this.filter_string = filter_string;
232 this.filter = parseFilterString(filter_string);
233 }
234 public String getFilterString() {
235 return this.filter_string ;
236 }
237
238 public Filter getFilter() {
239 return this.filter;
240 }
241
242 public void setIndexDir(String full_indexdir) {
243 this.full_indexdir = full_indexdir;
244 }
245
246 public void setFuzziness(String fuzziness) {
247 this.fuzziness = fuzziness;
248 }
249 public String getFuzziness() {
250 return this.fuzziness;
251 }
252
253 public void setSortField(String sort_field) {
254 this.sort_field = sort_field;
255 this.sorter = new Sort(sort_field);
256 }
257 public String getSortField() {
258 return this.sort_field;
259 }
260
261 public void setStartResults(int start_results) {
262 this.start_results = start_results;
263 }
264 public int getStartResults() {
265 return this.start_results;
266 }
267
268 public void cleanUp() {
269 try {
270 searcher.close();
271 } catch (IOException exception) {
272 exception.printStackTrace();
273 }
274 }
275
276 private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
277 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
278 {
279 // Split query string into the search terms and the filter terms
280 // * The first +(...) term contains the search terms so count
281 // up '(' and stop when we finish matching ')'
282 int offset = 0;
283 int paren_count = 0;
284 boolean seen_paren = false;
285 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
286 if (query_string.charAt(offset) == '(') {
287 paren_count++;
288 seen_paren = true;
289 }
290 if (query_string.charAt(offset) == ')') {
291 paren_count--;
292 }
293 offset++;
294 }
295 String query_prefix = query_string.substring(0, offset);
296 String query_suffix = query_string.substring(offset);
297
298 ///ystem.err.println("Prefix: " + query_prefix);
299 ///ystem.err.println("Suffix: " + query_suffix);
300
301 Query query = query_parser.parse(query_prefix);
302 query = query.rewrite(reader);
303
304 // If this is a fuzzy search, then we need to add the fuzzy
305 // flag to each of the query terms
306 if (fuzziness != null && query.toString().length() > 0) {
307
308 // Revert the query to a string
309 System.err.println("Rewritten query: " + query.toString());
310 // Search through the string for TX:<term> query terms
311 // and append the ~ operator. Note that this search will
312 // not change phrase searches (TX:"<term> <term>") as
313 // fuzzy searching is not possible for these entries.
314 // Yahoo! Time for a state machine!
315 StringBuffer mutable_query_string = new StringBuffer(query.toString());
316 int o = 0; // Offset
317 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
318 int s = 0; // State
319 while(o < mutable_query_string.length()) {
320 char c = mutable_query_string.charAt(o);
321 if (s == 0 && c == TEXTFIELD.charAt(0)) {
322 ///ystem.err.println("Found T!");
323 s = 1;
324 }
325 else if (s == 1) {
326 if (c == TEXTFIELD.charAt(1)) {
327 ///ystem.err.println("Found X!");
328 s = 2;
329 }
330 else {
331 s = 0; // Reset
332 }
333 }
334 else if (s == 2) {
335 if (c == ':') {
336 ///ystem.err.println("Found TX:!");
337 s = 3;
338 }
339 else {
340 s = 0; // Reset
341 }
342 }
343 else if (s == 3) {
344 // Don't process phrases
345 if (c == '"') {
346 ///ystem.err.println("Stupid phrase...");
347 s = 0; // Reset
348 }
349 // Found the end of the term... add the
350 // fuzzy search indicator
351 // Nor outside the scope of parentheses
352 else if (Character.isWhitespace(c) || c == ')') {
353 ///ystem.err.println("Yahoo! Found fuzzy term.");
354 mutable_query_string.insert(o, '~' + fuzziness);
355 o++;
356 s = 0; // Reset
357 }
358 }
359 o++;
360 }
361 // If we were in the state of looking for the end of a
362 // term - then we just found it!
363 if (s == 3) {
364
365 mutable_query_string.append('~' + fuzziness);
366 }
367 // Reparse the query
368 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
369 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
370 }
371 else {
372 query = query_parser.parse(query_prefix + query_suffix);
373 }
374
375 return query;
376 }
377
378 private Filter parseFilterString(String filter_string)
379 {
380 Filter result = null;
381 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
382 Matcher matcher = pattern.matcher(filter_string);
383 if (matcher.matches()) {
384 String field_name = matcher.group(1);
385 boolean include_lower = matcher.group(2).equals("[");
386 String lower_term = matcher.group(3);
387 String upper_term = matcher.group(4);
388 boolean include_upper = matcher.group(5).equals("]");
389 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
390 }
391 else {
392 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
393 }
394 return result;
395 }
396
397
398 /** command line program and auxiliary methods */
399
400 // Fairly self-explanatory I should hope
401 static private boolean query_result_caching_enabled = false;
402
403 static public void main (String args[])
404 {
405 if (args.length == 0) {
406 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
407 return;
408 }
409
410 try {
411 String index_directory = args[0];
412
413 GS2LuceneQuery queryer = new GS2LuceneQuery();
414 queryer.setIndexDir(index_directory);
415
416 // Prepare the index cache directory, if query result caching is enabled
417 if (query_result_caching_enabled) {
418 // Make the index cache directory if it doesn't already exist
419 File index_cache_directory = new File(index_directory, "cache");
420 if (!index_cache_directory.exists()) {
421 index_cache_directory.mkdir();
422 }
423
424 // Disable caching if the index cache directory isn't available
425 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
426 query_result_caching_enabled = false;
427 }
428 }
429
430 String query_string = null;
431
432 // Parse the command-line arguments
433 for (int i = 1; i < args.length; i++) {
434 if (args[i].equals("-sort")) {
435 i++;
436 queryer.setSortField(args[i]);
437 }
438 else if (args[i].equals("-filter")) {
439 i++;
440 queryer.setFilterString(args[i]);
441 }
442 else if (args[i].equals("-dco")) {
443 i++;
444 queryer.setDefaultConjunctionOperator(args[i]);
445 }
446 else if (args[i].equals("-fuzziness")) {
447 i++;
448 queryer.setFuzziness(args[i]);
449 }
450 else if (args[i].equals("-startresults")) {
451 i++;
452 if (args[i].matches("\\d+")) {
453 queryer.setStartResults(Integer.parseInt(args[i]));
454 }
455 }
456 else if (args[i].equals("-endresults")) {
457 i++;
458 if (args[i].matches("\\d+")) {
459 queryer.setEndResults(Integer.parseInt(args[i]));
460 }
461 }
462 else {
463 query_string = args[i];
464 }
465 }
466
467 if (!queryer.initialise()) {
468 return;
469 }
470
471 // The query string has been specified as a command-line argument
472 if (query_string != null) {
473 runQueryCaching(index_directory, queryer, query_string);
474 }
475
476 // Read queries from STDIN
477 else {
478 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
479 while (true) {
480 // Read the query from STDIN
481 query_string = in.readLine();
482 if (query_string == null || query_string.length() == -1) {
483 break;
484 }
485 runQueryCaching(index_directory, queryer, query_string);
486
487 }
488 }
489 queryer.cleanUp();
490 }
491 catch (IOException exception) {
492 exception.printStackTrace();
493 }
494 }
495
496 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
497 throws IOException
498 {
499 StringBuffer query_results_xml = new StringBuffer();
500
501 // Check if this query result has been cached from a previous search (if it's enabled)
502 File query_result_cache_file = null;
503 if (query_result_caching_enabled) {
504 // Generate the cache file name from the query options
505 String query_result_cache_file_name = query_string + "-";
506 String fuzziness = queryer.getFuzziness();
507 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
508 String filter_string = queryer.getFilterString();
509 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
510 String sort_string = queryer.getSortField();
511 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
512 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
513 query_result_cache_file_name += default_conjunction_operator + "-";
514 int start_results = queryer.getStartResults();
515 int end_results = queryer.getEndResults();
516 query_result_cache_file_name += start_results + "-" + end_results;
517 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
518
519 // If the query result cache file exists, just return its contents and we're done
520 File index_cache_directory = new File(index_directory, "cache");
521 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
522 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
523 FileInputStream fis = new FileInputStream(query_result_cache_file);
524 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
525 BufferedReader buffered_reader = new BufferedReader(isr);
526 String line = "";
527 while ((line = buffered_reader.readLine()) != null) {
528 query_results_xml.append(line + "\n");
529 }
530 String query_results_xml_string = query_results_xml.toString();
531 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
532 System.out.print(query_results_xml_string);
533 return;
534 }
535 }
536
537 // not cached
538 query_results_xml.append("<ResultSet cached=\"false\">\n");
539 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
540 Filter filter = queryer.getFilter();
541 if (filter != null) {
542 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
543 }
544
545 LuceneQueryResult query_result = queryer.runQuery(query_string);
546 if (query_result == null) {
547 System.err.println("Couldn't run the query");
548 return;
549 }
550
551 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
552 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
553 } else {
554 query_results_xml.append(query_result.getXMLString());
555 }
556 query_results_xml.append("</ResultSet>\n");
557
558 System.out.print(query_results_xml);
559
560 // Cache this query result, if desired
561 if (query_result_caching_enabled) {
562 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
563 query_result_cache_file_writer.write(query_results_xml.toString());
564 query_result_cache_file_writer.close();
565 }
566 }
567
568 private static String fileSafe(String text)
569 {
570 StringBuffer file_safe_text = new StringBuffer();
571 for (int i = 0; i < text.length(); i++) {
572 char character = text.charAt(i);
573 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
574 file_safe_text.append(character);
575 }
576 else {
577 file_safe_text.append('%');
578 file_safe_text.append((int) character);
579 }
580 }
581 return file_safe_text.toString();
582 }
583
584
585}
586
587
Note: See TracBrowser for help on using the repository browser.