source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 18614

Last change on this file since 18614 was 18614, checked in by mdewsnip, 15 years ago

Removed a big wodge of debug crap that someone left...

  • Property svn:keywords set to Author Date Id Revision
File size: 20.6 KB
RevLine 
[13557]1/**********************************************************************
[8521]2 *
[13557]3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
[13686]26package org.greenstone.LuceneWrapper;
[12257]27
28
[12429]29import java.io.*;
30import java.util.*;
[12656]31import java.util.regex.*;
[8521]32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
[12256]36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
[12975]38import org.apache.lucene.index.TermDocs;
[12418]39import org.apache.lucene.queryParser.ParseException;
[12256]40import org.apache.lucene.queryParser.QueryParser;
[12418]41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
[12408]42import org.apache.lucene.search.Filter;
[12377]43import org.apache.lucene.search.Hit;
[12256]44import org.apache.lucene.search.Hits;
[8521]45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
[12408]47import org.apache.lucene.search.RangeFilter;
[12256]48import org.apache.lucene.search.Searcher;
[12980]49import org.apache.lucene.search.ScoreDoc;
[12275]50import org.apache.lucene.search.Sort;
[12980]51import org.apache.lucene.search.TopFieldDocs;
[8521]52
53
[12256]54public class GS2LuceneQuery
55{
[13557]56
57
[12846]58 static private String TEXTFIELD = "TX";
59
[12653]60 // Use the standard set of English stop words by default
[17804]61 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
[12653]62
[13557]63 private String full_indexdir="";
64 private String default_conjunction_operator = "OR";
65 private String fuzziness = null;
66 private String sort_field = null;
67 private Sort sorter=new Sort();
68 private String filter_string = null;
69 private Filter filter = null;
70 private int start_results=1;
71 private int end_results=Integer.MAX_VALUE;
[12653]72
[13557]73 private QueryParser query_parser = null;
74 private QueryParser query_parser_no_stop_words = null;
75 private Searcher searcher = null;
76 private IndexReader reader = null;
[16015]77
78 static private PrintWriter utf8out = null;
79
80 static
81 {
82 try {
83 OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
84 utf8out = new PrintWriter(osw, true);
85 }
86 catch (UnsupportedEncodingException e) {
87 System.out.println(e);
88 }
89 }
90
[13557]91
92 public GS2LuceneQuery() {
[12981]93
[13557]94 // Create one query parser with the standard set of stop words, and one with none
[12256]95
[17804]96 query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
97 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
[13557]98 }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
[16015]104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
[13557]106 return false;
107 }
[8521]108 try {
[13557]109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
[12772]111
[12983]112 }
113 catch (IOException exception) {
[13557]114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
[12983]119 }
[13557]120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
[16015]124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
[13557]126 return null;
[12999]127 }
128
[13557]129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
[12983]134 query_including_stop_words = query_including_stop_words.rewrite(reader);
[13557]135
[17804]136 // System.err.println("********* query_string " + query_string + "****");
137
[12983]138 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
139 query = query.rewrite(reader);
[13557]140
141 // Get the list of expanded query terms and their frequencies
142 // num docs matching, and total frequency
[12983]143 HashSet terms = new HashSet();
144 query.extractTerms(terms);
[16912]145
146 HashMap doc_term_freq_map = new HashMap();
[13557]147
148 Iterator iter = terms.iterator();
149 while (iter.hasNext()) {
150
151 Term term = (Term) iter.next();
152
[12983]153 // Get the term frequency over all the documents
154 TermDocs term_docs = reader.termDocs(term);
[16912]155 int term_freq = 0;
[13557]156 int match_docs = 0;
[16912]157 while (term_docs.next())
158 {
159 if (term_docs.freq() != 0)
160 {
161 term_freq += term_docs.freq();
[13557]162 match_docs++;
[16912]163
164 // Calculate the document-level term frequency as well
165 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
166 int doc_term_freq = 0;
167 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
168 {
169 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
170 }
171 doc_term_freq += term_docs.freq();
172
173 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
[13557]174 }
[12983]175 }
[12976]176
[13557]177 // Create a term
178 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
[12983]179 }
[13557]180
181 // Get the list of stop words removed from the query
[12983]182 HashSet terms_including_stop_words = new HashSet();
183 query_including_stop_words.extractTerms(terms_including_stop_words);
184 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
185 while (terms_including_stop_words_iter.hasNext()) {
186 Term term = (Term) terms_including_stop_words_iter.next();
187 if (!terms.contains(term)) {
[13557]188 lucene_query_result.addStopWord(term.text());
[12983]189 }
190 }
[13570]191
[13557]192 // do the query
[12983]193 // Simple case for getting all the matching documents
194 if (end_results == Integer.MAX_VALUE) {
195 // Perform the query (filter and sorter may be null)
196 Hits hits = searcher.search(query, filter, sorter);
[13557]197 lucene_query_result.setTotalDocs(hits.length());
[12983]198
199 // Output the matching documents
[13557]200 lucene_query_result.setStartResults(start_results);
201 lucene_query_result.setEndResults(hits.length());
202
[12983]203 for (int i = start_results; i <= hits.length(); i++) {
[16912]204 int lucene_doc_num = hits.id(i - 1);
[12983]205 Document doc = hits.doc(i - 1);
[16912]206 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
[16947]207 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
[12372]208 }
[12983]209 }
210
211 // Slightly more complicated case for returning a subset of the matching documents
212 else {
213 // Perform the query (filter may be null)
214 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
[13557]215 lucene_query_result.setTotalDocs(hits.totalHits);
216
217 lucene_query_result.setStartResults(start_results);
218 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
[12983]219
220 // Output the matching documents
221 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
[16912]222 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
223 Document doc = reader.document(lucene_doc_num);
224 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
[16947]225 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
[12418]226 }
[8521]227 }
228 }
[13557]229
[12983]230 catch (ParseException parse_exception) {
[13557]231 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
[12418]232 }
[12983]233 catch (TooManyClauses too_many_clauses_exception) {
[13557]234 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
[12983]235 }
[13557]236 catch (IOException exception) {
237 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
238 exception.printStackTrace();
239 }
[13570]240 catch (Exception exception) {
241 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
242 exception.printStackTrace();
243 }
[13557]244 return lucene_query_result;
245 }
[12993]246
[13557]247 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
248 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
[14559]249 if (default_conjunction_operator.equals("AND")) {
[13557]250 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
251 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
252 } else { // default is OR
253 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
254 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
[12999]255 }
[8521]256 }
[13557]257
    /**
     * Returns the default conjunction operator as stored by
     * setDefaultConjunctionOperator() (upper-cased there); "OR" initially.
     */
    public String getDefaultConjunctionOperator() {
        return this.default_conjunction_operator;
    }
261
    /**
     * Sets the 1-based position of the last result to return. Leaving it at
     * Integer.MAX_VALUE (the initial value) makes runQuery() return every hit
     * from the start position onwards.
     */
    public void setEndResults(int end_results) {
        this.end_results = end_results;
    }
    /** Returns the 1-based position of the last result to return. */
    public int getEndResults() {
        return this.end_results;
    }
268
    /**
     * Stores the filter string and parses it into the Filter used by
     * runQuery(). If the string cannot be parsed the filter becomes null.
     */
    public void setFilterString(String filter_string) {
        this.filter_string = filter_string;
        this.filter = parseFilterString(filter_string);
    }
    /** Returns the filter string exactly as passed to setFilterString(), or null if none was set. */
    public String getFilterString() {
        return this.filter_string ;
    }
276
    /** Returns the Filter parsed from the filter string, or null if there is none. */
    public Filter getFilter() {
        return this.filter;
    }
[12408]280
    /** Sets the path of the index directory that initialise() will open. */
    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }
284
    /**
     * Sets the fuzziness value appended after '~' to each TX: term when the
     * query is parsed; null (the initial value) disables fuzzy searching.
     */
    public void setFuzziness(String fuzziness) {
        this.fuzziness = fuzziness;
    }
    /** Returns the fuzziness value, or null if fuzzy searching is disabled. */
    public String getFuzziness() {
        return this.fuzziness;
    }
291
292 public void setSortField(String sort_field) {
293 this.sort_field = sort_field;
[13570]294 if (sort_field == null) {
295 this.sorter = new Sort();
296 } else {
297 this.sorter = new Sort(sort_field);
298 }
[13557]299 }
    /** Returns the name of the sort field, or null if none has been set. */
    public String getSortField() {
        return this.sort_field;
    }
303
304 public void setStartResults(int start_results) {
[13570]305 if (start_results < 1) {
306 start_results = 1;
307 }
[13557]308 this.start_results = start_results;
309 }
    /** Returns the 1-based position of the first result to return. */
    public int getStartResults() {
        return this.start_results;
    }
313
314 public void cleanUp() {
315 try {
[13909]316 if (searcher != null) {
317 searcher.close();
318 }
[13557]319 } catch (IOException exception) {
320 exception.printStackTrace();
[12999]321 }
322 }
323
[13557]324 private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
[12415]325 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
326 {
327 // Split query string into the search terms and the filter terms
328 // * The first +(...) term contains the search terms so count
329 // up '(' and stop when we finish matching ')'
330 int offset = 0;
331 int paren_count = 0;
332 boolean seen_paren = false;
[12772]333 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
334 if (query_string.charAt(offset) == '(') {
335 paren_count++;
336 seen_paren = true;
[12415]337 }
[12772]338 if (query_string.charAt(offset) == ')') {
339 paren_count--;
340 }
341 offset++;
342 }
[12415]343 String query_prefix = query_string.substring(0, offset);
344 String query_suffix = query_string.substring(offset);
[13557]345
[12415]346 ///ystem.err.println("Prefix: " + query_prefix);
347 ///ystem.err.println("Suffix: " + query_suffix);
[13557]348
[12415]349 Query query = query_parser.parse(query_prefix);
350 query = query.rewrite(reader);
[13557]351
[12415]352 // If this is a fuzzy search, then we need to add the fuzzy
353 // flag to each of the query terms
[12772]354 if (fuzziness != null && query.toString().length() > 0) {
[13557]355
[12772]356 // Revert the query to a string
357 System.err.println("Rewritten query: " + query.toString());
358 // Search through the string for TX:<term> query terms
[13557]359 // and append the ~ operator. Note that this search will
[12772]360 // not change phrase searches (TX:"<term> <term>") as
361 // fuzzy searching is not possible for these entries.
362 // Yahoo! Time for a state machine!
363 StringBuffer mutable_query_string = new StringBuffer(query.toString());
364 int o = 0; // Offset
365 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
366 int s = 0; // State
[13557]367 while(o < mutable_query_string.length()) {
[12772]368 char c = mutable_query_string.charAt(o);
[12846]369 if (s == 0 && c == TEXTFIELD.charAt(0)) {
[12772]370 ///ystem.err.println("Found T!");
371 s = 1;
372 }
373 else if (s == 1) {
[12846]374 if (c == TEXTFIELD.charAt(1)) {
[12772]375 ///ystem.err.println("Found X!");
376 s = 2;
377 }
378 else {
379 s = 0; // Reset
380 }
381 }
382 else if (s == 2) {
383 if (c == ':') {
384 ///ystem.err.println("Found TX:!");
385 s = 3;
386 }
387 else {
388 s = 0; // Reset
389 }
390 }
391 else if (s == 3) {
392 // Don't process phrases
393 if (c == '"') {
394 ///ystem.err.println("Stupid phrase...");
395 s = 0; // Reset
396 }
397 // Found the end of the term... add the
398 // fuzzy search indicator
399 // Nor outside the scope of parentheses
400 else if (Character.isWhitespace(c) || c == ')') {
401 ///ystem.err.println("Yahoo! Found fuzzy term.");
402 mutable_query_string.insert(o, '~' + fuzziness);
[12415]403 o++;
[12772]404 s = 0; // Reset
[12415]405 }
[12772]406 }
407 o++;
[12415]408 }
[12772]409 // If we were in the state of looking for the end of a
410 // term - then we just found it!
411 if (s == 3) {
[13557]412
[12772]413 mutable_query_string.append('~' + fuzziness);
[12415]414 }
[12772]415 // Reparse the query
416 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
417 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
418 }
419 else {
420 query = query_parser.parse(query_prefix + query_suffix);
421 }
422
[12415]423 return query;
424 }
[12656]425
[13557]426 private Filter parseFilterString(String filter_string)
427 {
428 Filter result = null;
429 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
430 Matcher matcher = pattern.matcher(filter_string);
431 if (matcher.matches()) {
432 String field_name = matcher.group(1);
433 boolean include_lower = matcher.group(2).equals("[");
434 String lower_term = matcher.group(3);
435 String upper_term = matcher.group(4);
436 boolean include_upper = matcher.group(5).equals("]");
437 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
438 }
439 else {
440 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
441 }
442 return result;
443 }
[12656]444
[16015]445
    /**
     * Flushes the shared output writer when this object is finalized, then
     * runs the superclass finalizer (even if the flush throws).
     */
    protected void finalize() throws Throwable
    {
        try {
            utf8out.flush();
        } finally {
            super.finalize();
        }
    }
454
[13557]455
456 /** command line program and auxiliary methods */
457
458 // Fairly self-explanatory I should hope
459 static private boolean query_result_caching_enabled = false;
460
[16015]461
[13557]462 static public void main (String args[])
463 {
[16015]464
465
[13557]466 if (args.length == 0) {
467 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
468 return;
469 }
470
471 try {
472 String index_directory = args[0];
473
474 GS2LuceneQuery queryer = new GS2LuceneQuery();
475 queryer.setIndexDir(index_directory);
476
477 // Prepare the index cache directory, if query result caching is enabled
478 if (query_result_caching_enabled) {
479 // Make the index cache directory if it doesn't already exist
480 File index_cache_directory = new File(index_directory, "cache");
481 if (!index_cache_directory.exists()) {
482 index_cache_directory.mkdir();
483 }
484
485 // Disable caching if the index cache directory isn't available
486 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
487 query_result_caching_enabled = false;
488 }
489 }
490
491 String query_string = null;
492
493 // Parse the command-line arguments
494 for (int i = 1; i < args.length; i++) {
495 if (args[i].equals("-sort")) {
496 i++;
497 queryer.setSortField(args[i]);
498 }
499 else if (args[i].equals("-filter")) {
500 i++;
501 queryer.setFilterString(args[i]);
502 }
503 else if (args[i].equals("-dco")) {
504 i++;
505 queryer.setDefaultConjunctionOperator(args[i]);
506 }
507 else if (args[i].equals("-fuzziness")) {
508 i++;
509 queryer.setFuzziness(args[i]);
510 }
511 else if (args[i].equals("-startresults")) {
512 i++;
513 if (args[i].matches("\\d+")) {
514 queryer.setStartResults(Integer.parseInt(args[i]));
515 }
516 }
517 else if (args[i].equals("-endresults")) {
518 i++;
519 if (args[i].matches("\\d+")) {
520 queryer.setEndResults(Integer.parseInt(args[i]));
521 }
522 }
523 else {
524 query_string = args[i];
525 }
526 }
527
528 if (!queryer.initialise()) {
529 return;
530 }
531
532 // The query string has been specified as a command-line argument
533 if (query_string != null) {
534 runQueryCaching(index_directory, queryer, query_string);
535 }
536
537 // Read queries from STDIN
538 else {
539 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
540 while (true) {
541 // Read the query from STDIN
542 query_string = in.readLine();
543 if (query_string == null || query_string.length() == -1) {
544 break;
545 }
[16015]546
[13557]547 runQueryCaching(index_directory, queryer, query_string);
548
549 }
550 }
551 queryer.cleanUp();
552 }
553 catch (IOException exception) {
554 exception.printStackTrace();
555 }
[12772]556 }
[13557]557
558 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
559 throws IOException
560 {
561 StringBuffer query_results_xml = new StringBuffer();
562
563 // Check if this query result has been cached from a previous search (if it's enabled)
564 File query_result_cache_file = null;
565 if (query_result_caching_enabled) {
566 // Generate the cache file name from the query options
567 String query_result_cache_file_name = query_string + "-";
568 String fuzziness = queryer.getFuzziness();
569 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
570 String filter_string = queryer.getFilterString();
571 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
572 String sort_string = queryer.getSortField();
573 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
574 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
575 query_result_cache_file_name += default_conjunction_operator + "-";
576 int start_results = queryer.getStartResults();
577 int end_results = queryer.getEndResults();
578 query_result_cache_file_name += start_results + "-" + end_results;
579 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
580
581 // If the query result cache file exists, just return its contents and we're done
582 File index_cache_directory = new File(index_directory, "cache");
583 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
584 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
585 FileInputStream fis = new FileInputStream(query_result_cache_file);
586 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
587 BufferedReader buffered_reader = new BufferedReader(isr);
588 String line = "";
589 while ((line = buffered_reader.readLine()) != null) {
590 query_results_xml.append(line + "\n");
591 }
592 String query_results_xml_string = query_results_xml.toString();
593 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
[16015]594
595 utf8out.print(query_results_xml_string);
596 utf8out.flush();
597
[13557]598 return;
599 }
600 }
601
602 // not cached
603 query_results_xml.append("<ResultSet cached=\"false\">\n");
604 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
605 Filter filter = queryer.getFilter();
606 if (filter != null) {
607 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
608 }
609
610 LuceneQueryResult query_result = queryer.runQuery(query_string);
611 if (query_result == null) {
612 System.err.println("Couldn't run the query");
613 return;
614 }
615
616 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
617 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
618 } else {
619 query_results_xml.append(query_result.getXMLString());
620 }
621 query_results_xml.append("</ResultSet>\n");
622
[16015]623 utf8out.print(query_results_xml);
624 utf8out.flush();
[13557]625
626 // Cache this query result, if desired
627 if (query_result_caching_enabled) {
628 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
629 query_result_cache_file_writer.write(query_results_xml.toString());
630 query_result_cache_file_writer.close();
631 }
[12772]632 }
[13557]633
634 private static String fileSafe(String text)
635 {
636 StringBuffer file_safe_text = new StringBuffer();
637 for (int i = 0; i < text.length(); i++) {
638 char character = text.charAt(i);
639 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
640 file_safe_text.append(character);
641 }
642 else {
643 file_safe_text.append('%');
644 file_safe_text.append((int) character);
645 }
646 }
647 return file_safe_text.toString();
648 }
649
650
[8521]651}
[13557]652
653
Note: See TracBrowser for help on using the repository browser.