source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 24641

Last change on this file since 24641 was 24641, checked in by davidb, 13 years ago

Initial cut at Greenstone3 runtime code to support Solr. Solr code based on version 3.3, so this also includes an upgraded version of the LuceneWrapper code (gs2build/common-src/indexers/lucene-gs) that works with this version of the support jar files

File size: 20.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.TermRangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.SortField;
52import org.apache.lucene.search.TopFieldDocs;
53
54import org.apache.lucene.store.Directory;
55import org.apache.lucene.store.FSDirectory;
56import org.apache.lucene.util.Version;
57
58public class GS2LuceneQuery extends SharedSoleneQuery
59{
60 protected String full_indexdir="";
61
62 protected Sort sorter=new Sort();
63 protected Filter filter = null;
64
65 protected static Version matchVersion = Version.LUCENE_24;
66
67 protected QueryParser query_parser = null;
68 protected QueryParser query_parser_no_stop_words = null;
69 protected Searcher searcher = null;
70 protected IndexReader reader = null;
71
72 public GS2LuceneQuery() {
73 super();
74
75 // Create one query parser with the standard set of stop words, and one with none
76
77 query_parser = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
78 query_parser_no_stop_words = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer(new String[] { }));
79 }
80
81
82 public boolean initialise() {
83
84 if (!super.initialise()) {
85 return false;
86 }
87
88
89 if (full_indexdir==null || full_indexdir.length()==-1){
90 utf8out.println("Index directory is not indicated ");
91 utf8out.flush();
92 return false;
93 }
94
95 try {
96 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
97 searcher = new IndexSearcher(full_indexdir_dir,true);
98 reader = ((IndexSearcher) searcher).getIndexReader();
99
100 }
101 catch (IOException exception) {
102 exception.printStackTrace();
103 return false;
104 }
105 return true;
106
107 }
108
109 public void setIndexDir(String full_indexdir) {
110 this.full_indexdir = full_indexdir;
111 }
112
113 public void setSortField(String sort_field) {
114 super.setSortField(sort_field);
115
116 if (sort_field == null) {
117 this.sorter = new Sort();
118 } else {
119 this.sorter = new Sort(new SortField(sort_field,SortField.STRING)); // **** can do better than this?!?
120 }
121 }
122
123 public void setFilterString(String filter_string) {
124 super.setFilterString(filter_string);
125 this.filter = parseFilterString(filter_string);
126 }
127
128 public Filter getFilter() {
129 return this.filter;
130 }
131
132
133 public LuceneQueryResult runQuery(String query_string) {
134
135 if (query_string == null || query_string.equals("")) {
136 utf8out.println("The query word is not indicated ");
137 utf8out.flush();
138 return null;
139 }
140
141 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
142 lucene_query_result.clear();
143
144 try {
145 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
146 query_including_stop_words = query_including_stop_words.rewrite(reader);
147
148 // System.err.println("********* query_string " + query_string + "****");
149
150 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
151 query = query.rewrite(reader);
152
153 // Get the list of expanded query terms and their frequencies
154 // num docs matching, and total frequency
155 HashSet terms = new HashSet();
156 query.extractTerms(terms);
157
158 HashMap doc_term_freq_map = new HashMap();
159
160 Iterator iter = terms.iterator();
161 while (iter.hasNext()) {
162
163 Term term = (Term) iter.next();
164
165 // Get the term frequency over all the documents
166 TermDocs term_docs = reader.termDocs(term);
167 int term_freq = 0;
168 int match_docs = 0;
169 while (term_docs.next())
170 {
171 if (term_docs.freq() != 0)
172 {
173 term_freq += term_docs.freq();
174 match_docs++;
175
176 // Calculate the document-level term frequency as well
177 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
178 int doc_term_freq = 0;
179 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
180 {
181 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
182 }
183 doc_term_freq += term_docs.freq();
184
185 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
186 }
187 }
188
189 // Create a term
190 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
191 }
192
193 // Get the list of stop words removed from the query
194 HashSet terms_including_stop_words = new HashSet();
195 query_including_stop_words.extractTerms(terms_including_stop_words);
196 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
197 while (terms_including_stop_words_iter.hasNext()) {
198 Term term = (Term) terms_including_stop_words_iter.next();
199 if (!terms.contains(term)) {
200 lucene_query_result.addStopWord(term.text());
201 }
202 }
203
204 // do the query
205 // Simple case for getting all the matching documents
206 if (end_results == Integer.MAX_VALUE) {
207 // Perform the query (filter and sorter may be null)
208 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
209 lucene_query_result.setTotalDocs(hits.totalHits);
210
211 // Output the matching documents
212 lucene_query_result.setStartResults(start_results);
213 lucene_query_result.setEndResults(hits.totalHits);
214
215 for (int i = start_results; i <= hits.totalHits; i++) {
216 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
217 Document doc = reader.document(lucene_doc_num);
218 int doc_term_freq = 0;
219 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
220 if (doc_term_freq_object != null)
221 {
222 doc_term_freq = doc_term_freq_object.intValue();
223 }
224 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
225 }
226 }
227
228 // Slightly more complicated case for returning a subset of the matching documents
229 else {
230 // Perform the query (filter may be null)
231 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
232 lucene_query_result.setTotalDocs(hits.totalHits);
233
234 lucene_query_result.setStartResults(start_results);
235 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
236
237 // Output the matching documents
238 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
239 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
240 Document doc = reader.document(lucene_doc_num);
241 int doc_term_freq = 0;
242 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
243 if (doc_term_freq_object != null)
244 {
245 doc_term_freq = doc_term_freq_object.intValue();
246 }
247 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
248 }
249 }
250 }
251
252 catch (ParseException parse_exception) {
253 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
254 }
255 catch (TooManyClauses too_many_clauses_exception) {
256 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
257 }
258 catch (IOException exception) {
259 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
260 exception.printStackTrace();
261 }
262 catch (Exception exception) {
263 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
264 exception.printStackTrace();
265 }
266 return lucene_query_result;
267 }
268
269 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
270 super.setDefaultConjunctionOperator(default_conjunction_operator);
271
272 if (default_conjunction_operator.equals("AND")) {
273 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
274 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
275 } else { // default is OR
276 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
277 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
278 }
279 }
280
281
282 public void cleanUp() {
283 super.cleanUp();
284 try {
285 if (searcher != null) {
286 searcher.close();
287 }
288 } catch (IOException exception) {
289 exception.printStackTrace();
290 }
291 }
292
293
294 protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
295 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
296 {
297 // Split query string into the search terms and the filter terms
298 // * The first +(...) term contains the search terms so count
299 // up '(' and stop when we finish matching ')'
300 int offset = 0;
301 int paren_count = 0;
302 boolean seen_paren = false;
303 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
304 if (query_string.charAt(offset) == '(') {
305 paren_count++;
306 seen_paren = true;
307 }
308 if (query_string.charAt(offset) == ')') {
309 paren_count--;
310 }
311 offset++;
312 }
313 String query_prefix = query_string.substring(0, offset);
314 String query_suffix = query_string.substring(offset);
315
316 ///ystem.err.println("Prefix: " + query_prefix);
317 ///ystem.err.println("Suffix: " + query_suffix);
318
319 Query query = query_parser.parse(query_prefix);
320 query = query.rewrite(reader);
321
322 // If this is a fuzzy search, then we need to add the fuzzy
323 // flag to each of the query terms
324 if (fuzziness != null && query.toString().length() > 0) {
325
326 // Revert the query to a string
327 System.err.println("Rewritten query: " + query.toString());
328 // Search through the string for TX:<term> query terms
329 // and append the ~ operator. Note that this search will
330 // not change phrase searches (TX:"<term> <term>") as
331 // fuzzy searching is not possible for these entries.
332 // Yahoo! Time for a state machine!
333 StringBuffer mutable_query_string = new StringBuffer(query.toString());
334 int o = 0; // Offset
335 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
336 int s = 0; // State
337 while(o < mutable_query_string.length()) {
338 char c = mutable_query_string.charAt(o);
339 if (s == 0 && c == TEXTFIELD.charAt(0)) {
340 ///ystem.err.println("Found T!");
341 s = 1;
342 }
343 else if (s == 1) {
344 if (c == TEXTFIELD.charAt(1)) {
345 ///ystem.err.println("Found X!");
346 s = 2;
347 }
348 else {
349 s = 0; // Reset
350 }
351 }
352 else if (s == 2) {
353 if (c == ':') {
354 ///ystem.err.println("Found TX:!");
355 s = 3;
356 }
357 else {
358 s = 0; // Reset
359 }
360 }
361 else if (s == 3) {
362 // Don't process phrases
363 if (c == '"') {
364 ///ystem.err.println("Stupid phrase...");
365 s = 0; // Reset
366 }
367 // Found the end of the term... add the
368 // fuzzy search indicator
369 // Nor outside the scope of parentheses
370 else if (Character.isWhitespace(c) || c == ')') {
371 ///ystem.err.println("Yahoo! Found fuzzy term.");
372 mutable_query_string.insert(o, '~' + fuzziness);
373 o++;
374 s = 0; // Reset
375 }
376 }
377 o++;
378 }
379 // If we were in the state of looking for the end of a
380 // term - then we just found it!
381 if (s == 3) {
382
383 mutable_query_string.append('~' + fuzziness);
384 }
385 // Reparse the query
386 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
387 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
388 }
389 else {
390 query = query_parser.parse(query_prefix + query_suffix);
391 }
392
393 return query;
394 }
395
396 protected Filter parseFilterString(String filter_string)
397 {
398 Filter result = null;
399 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
400 Matcher matcher = pattern.matcher(filter_string);
401 if (matcher.matches()) {
402 String field_name = matcher.group(1);
403 boolean include_lower = matcher.group(2).equals("[");
404 String lower_term = matcher.group(3);
405 String upper_term = matcher.group(4);
406 boolean include_upper = matcher.group(5).equals("]");
407 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
408 }
409 else {
410 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
411 }
412 return result;
413 }
414
415
416 /** command line program and auxiliary methods */
417
418 // Fairly self-explanatory I should hope
419 static protected boolean query_result_caching_enabled = false;
420
421
422 static public void main (String args[])
423 {
424 if (args.length == 0) {
425 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
426 return;
427 }
428
429 try {
430 String index_directory = args[0];
431
432 GS2LuceneQuery queryer = new GS2LuceneQuery();
433 queryer.setIndexDir(index_directory);
434
435 // Prepare the index cache directory, if query result caching is enabled
436 if (query_result_caching_enabled) {
437 // Make the index cache directory if it doesn't already exist
438 File index_cache_directory = new File(index_directory, "cache");
439 if (!index_cache_directory.exists()) {
440 index_cache_directory.mkdir();
441 }
442
443 // Disable caching if the index cache directory isn't available
444 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
445 query_result_caching_enabled = false;
446 }
447 }
448
449 String query_string = null;
450
451 // Parse the command-line arguments
452 for (int i = 1; i < args.length; i++) {
453 if (args[i].equals("-sort")) {
454 i++;
455 queryer.setSortField(args[i]);
456 }
457 else if (args[i].equals("-filter")) {
458 i++;
459 queryer.setFilterString(args[i]);
460 }
461 else if (args[i].equals("-dco")) {
462 i++;
463 queryer.setDefaultConjunctionOperator(args[i]);
464 }
465 else if (args[i].equals("-fuzziness")) {
466 i++;
467 queryer.setFuzziness(args[i]);
468 }
469 else if (args[i].equals("-startresults")) {
470 i++;
471 if (args[i].matches("\\d+")) {
472 queryer.setStartResults(Integer.parseInt(args[i]));
473 }
474 }
475 else if (args[i].equals("-endresults")) {
476 i++;
477 if (args[i].matches("\\d+")) {
478 queryer.setEndResults(Integer.parseInt(args[i]));
479 }
480 }
481 else {
482 query_string = args[i];
483 }
484 }
485
486 if (!queryer.initialise()) {
487 return;
488 }
489
490 // The query string has been specified as a command-line argument
491 if (query_string != null) {
492 runQueryCaching(index_directory, queryer, query_string);
493 }
494
495 // Read queries from STDIN
496 else {
497 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
498 while (true) {
499 // Read the query from STDIN
500 query_string = in.readLine();
501 if (query_string == null || query_string.length() == -1) {
502 break;
503 }
504
505 runQueryCaching(index_directory, queryer, query_string);
506
507 }
508 }
509 queryer.cleanUp();
510 }
511 catch (IOException exception) {
512 exception.printStackTrace();
513 }
514 }
515
516 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
517 throws IOException
518 {
519 StringBuffer query_results_xml = new StringBuffer();
520
521 // Check if this query result has been cached from a previous search (if it's enabled)
522 File query_result_cache_file = null;
523 if (query_result_caching_enabled) {
524 // Generate the cache file name from the query options
525 String query_result_cache_file_name = query_string + "-";
526 String fuzziness = queryer.getFuzziness();
527 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
528 String filter_string = queryer.getFilterString();
529 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
530 String sort_string = queryer.getSortField();
531 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
532 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
533 query_result_cache_file_name += default_conjunction_operator + "-";
534 int start_results = queryer.getStartResults();
535 int end_results = queryer.getEndResults();
536 query_result_cache_file_name += start_results + "-" + end_results;
537 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
538
539 // If the query result cache file exists, just return its contents and we're done
540 File index_cache_directory = new File(index_directory, "cache");
541 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
542 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
543 FileInputStream fis = new FileInputStream(query_result_cache_file);
544 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
545 BufferedReader buffered_reader = new BufferedReader(isr);
546 String line = "";
547 while ((line = buffered_reader.readLine()) != null) {
548 query_results_xml.append(line + "\n");
549 }
550 String query_results_xml_string = query_results_xml.toString();
551 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
552
553 utf8out.print(query_results_xml_string);
554 utf8out.flush();
555
556 return;
557 }
558 }
559
560 // not cached
561 query_results_xml.append("<ResultSet cached=\"false\">\n");
562 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
563 Filter filter = queryer.getFilter();
564 if (filter != null) {
565 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
566 }
567
568 LuceneQueryResult query_result = queryer.runQuery(query_string);
569 if (query_result == null) {
570 System.err.println("Couldn't run the query");
571 return;
572 }
573
574 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
575 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
576 } else {
577 query_results_xml.append(query_result.getXMLString());
578 }
579 query_results_xml.append("</ResultSet>\n");
580
581 utf8out.print(query_results_xml);
582 utf8out.flush();
583
584 // Cache this query result, if desired
585 if (query_result_caching_enabled) {
586 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
587 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
588 // files, it will just affect the speed of subsequent requests.
589 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
590 // can get very long in some collections)
591 try
592 {
593 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
594 query_result_cache_file_writer.write(query_results_xml.toString());
595 query_result_cache_file_writer.close();
596 }
597 catch (Exception exception)
598 {
599 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
600 }
601 }
602 }
603
604 protected static String fileSafe(String text)
605 {
606 StringBuffer file_safe_text = new StringBuffer();
607 for (int i = 0; i < text.length(); i++) {
608 char character = text.charAt(i);
609 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
610 file_safe_text.append(character);
611 }
612 else {
613 file_safe_text.append('%');
614 file_safe_text.append((int) character);
615 }
616 }
617 return file_safe_text.toString();
618 }
619
620
621}
622
623
Note: See TracBrowser for help on using the repository browser.