root/main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java @ 32609

Revision 32609, 30.4 KB (checked in by ak19, 7 months ago)

Preliminary stage before tackling a different bug. This commit fixes the index-folder file-locking problem that occurs on Windows when collection deactivation doesn't close all file handles to the collection's index folder after some lucene searches have been run. Inspecting the code revealed the possibility of a further, unrelated bug, for which Kathy devised a test to confirm its existence. Testing showed the bug is real: multiple queries configure the same query object (and its internal reader object), but the last configuration is always the one used to run a search. For example, one user wants to search a lucene collection at document level and a second user wants to search the same collection at section level. The second user's configuration wins if it is applied between the first user's query object being configured and their query being run, so the first user ends up seeing search results at section level.
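To illustrate that second bug (this sketch is not part of the revision): two callers sharing one GS2LuceneQuery instance can interleave so that the second caller's configuration is the one in force when the first caller's search runs. The sketch uses only the class's own setIndexDir()/initialise()/runQuery()/cleanUp() methods; the index paths, class name and query string are made up for the example.

// Sketch only: demonstrates the shared-configuration race described in the commit message above.
// Assumes GS2LuceneQuery and its dependencies are on the classpath; the "didx" (document-level)
// and "sidx" (section-level) index paths below are hypothetical.
package org.greenstone.LuceneWrapper4;

public class SharedQueryObjectSketch
{
    public static void main(String[] args)
    {
        GS2LuceneQuery shared_queryer = new GS2LuceneQuery();

        // User 1 configures the shared object for a document-level search
        shared_queryer.setIndexDir("/path/to/collect/demo/index/didx");
        shared_queryer.initialise();

        // Before user 1's search runs, user 2 reconfigures the SAME object for section level
        shared_queryer.setIndexDir("/path/to/collect/demo/index/sidx");
        shared_queryer.initialise();

        // User 1's search now runs against user 2's configuration,
        // so user 1 gets section-level results instead of document-level ones.
        LuceneQueryResult results_seen_by_user1 = shared_queryer.runQuery("snail farm");

        shared_queryer.cleanUp();
    }
}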

/**********************************************************************
 *
 * GS2LuceneQuery.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.LuceneWrapper4;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
//import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopFieldDocs;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.MultiFields;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class GS2LuceneQuery extends SharedSoleneQuery
{
    public static String SORT_RANK = "rank";
    public static String SORT_NATURAL = "natural";

    protected String full_indexdir = "";

    protected SortField.Type sort_type = SortField.Type.SCORE;
    protected boolean reverse_sort = false;
    protected Sort sorter = new Sort();
    protected Filter filter = null;

    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    protected IndexSearcher searcher = null;
    protected IndexReader reader = null;

    public GS2LuceneQuery() {
        super();

        // Create one query parser with the standard set of stop words, and one with none
        query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
        query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
    }

    public boolean initialise() {

        if (!super.initialise()) {
            return false;
        }

        if (full_indexdir == null || full_indexdir.length() == 0) {
            utf8out.println("Index directory is not indicated ");
            utf8out.flush();
            return false;
        }

        try {
            if (reader != null) {
                reader.close();
                searcher = null;
            }

            Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));

            reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. Now readOnly=true by default, and therefore also for searcher
            searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields

            this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
        }
        catch (IOException exception) {
            exception.printStackTrace();
            return false;
        }
        return true;
    }

    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }

    public void setSortField(String sort_field) {
        if (sort_field.equals(SORT_RANK)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.SCORE;
        } else if (sort_field.equals(SORT_NATURAL)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.DOC;
        } else {
            this.sort_field = sort_field;
            this.sort_type = SortField.Type.STRING; // for now. numeric??
        }
    }

    public void setReverseSort(boolean reverse) {
        this.reverse_sort = reverse;
    }

    public boolean getReverseSort() {
        return this.reverse_sort;
    }

    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }

    public Filter getFilter() {
        return this.filter;
    }

    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result = new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            // System.err.println("********* query_string " + query_string + "****");

            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = recursivelyRewriteQuery(query, reader, lucene_query_result);
            // System.err.println("@@@@ final query class name: " + query.getClass());

            // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
            // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
            // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
            // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
            // http://lucene.apache.org/core/4_7_2/MIGRATE.html

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            HashMap doc_term_freq_map = new HashMap();

            Iterator iter = terms.iterator();

            Bits liveDocs = null;
            if (reader.hasDeletions()) {
                System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
                liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
            }

            while (iter.hasNext()) {

                // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

                Term term = (Term) iter.next();
                // System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
                BytesRef term_bytes = term.bytes();
                DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

                // Get the term frequency over all the documents
                //TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;

                if (term_docs != null) {
                    int docID = -1;
                    while ((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) { //while (term_docs.next())
                        if (term_docs.freq() != 0)
                        {
                            term_freq += term_docs.freq();
                            match_docs++;

                            // Calculate the document-level term frequency as well
                            Integer lucene_doc_num_obj = new Integer(term_docs.docID());
                            int doc_term_freq = 0;
                            if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
                            {
                                doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
                            }
                            doc_term_freq += term_docs.freq();

                            doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
                        }
                    }
                } else {
                    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
            // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
            // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

            // 1. Figure out how many results there will be.
            //TotalHitCountCollector countCollector = new TotalHitCountCollector();
            //searcher.search(query, filter, collector);
            //int hitCount = collector.count;

            // Actually do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // Is there a slight difference in the definition between
                // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
                // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
                // Seems to be okay.
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

                lucene_query_result.setTotalDocs(hits.totalHits);

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.totalHits); // ??

                for (int i = start_results; i < hits.totalHits; i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results : hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }

    public void setDefaultConjunctionOperator(String default_conjunction_operator) {
        super.setDefaultConjunctionOperator(default_conjunction_operator);

        if (default_conjunction_operator.equals("AND")) {
            query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
        } else { // default is OR
            query_parser.setDefaultOperator(QueryParser.OR_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.OR_OPERATOR);
        }
    }


    public void cleanUp() {
        super.cleanUp();
        try {
            if (reader != null) {
                reader.close();
                // Closes files associated with this index. Also saves any new deletions to disk.
                // No other methods should be called after this has been called.
            }
        } catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term... add the
                    // fuzzy search indicator
                    // Nor outside the scope of parentheses
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }

    // If you're dealing with a BooleanQuery, it needs to be recursively rewritten
    // as it can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery),
    // e.g. season* farm
    // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
    // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
    // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
    // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
    // Call this method from runQuery() after it calls parseQuery().
    // Now searches like these will work
    //    season* farm
    //    season* farm*
    // and not just searches like the following which already used to work:
    //    season*
    //    snail farm
    // Idea for the solution of recursively processing a BooleanQuery came from inspecting the source code of BooleanQuery.java
    //    https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
    // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
    // subcomponents.
    protected Query recursivelyRewriteQuery(Query orig_query, IndexReader reader, LuceneQueryResult lucene_query_result) throws java.io.IOException
    {
        //Query query = orig_query.rewrite(reader);
        Query query = orig_query;

        if (orig_query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery)orig_query;
            List<BooleanClause> clauses = booleanQuery.clauses();
            for (BooleanClause clause : clauses) {
                Query subQuery = clause.getQuery();
                subQuery = recursivelyRewriteQuery(subQuery, reader, lucene_query_result);
                clause.setQuery(subQuery);
            }
        }

        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
        // This change in the lucene core library for GS3 (present since after version 2.4.1) had the
        // side-effect that searching on "econom*" didn't display what terms it was searching for,
        // whereas it had done so in GS2.

        // The details of this problem and its current solution are explained in the ticket
        // http://trac.greenstone.org/ticket/845

        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
        // to produce search terms again when the query gets rewritten.

        // We try, in order:
        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2:
        //    it will expand wildcard searches to their terms when searching at both section AND doc level.
        //    If that throws a TooManyClauses exception (like when searching for "a*" over the lucene demo collection),
        // 2. then try a custom rewriteMethod which keeps termCountCutoff=350 but raises the docCountPercent cutoff to 100%.
        //    If that also throws a TooManyClauses exception (which could perhaps happen if the collection has a huge number of docs),
        // 3. then fall back to the default apache rewriteMethod with its optimum defaults of
        //    termCountCutoff=350 and docCountPercent cutoff=0.1%.
        //    See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

        //System.err.println("@@@@ query class name: " + orig_query.getClass());
        //System.err.println("@@@@ QUERY: " + orig_query);

        if (orig_query instanceof MultiTermQuery) {
            MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
            // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE
        }

        try {
            query = orig_query.rewrite(reader);
        }
        catch (BooleanQuery.TooManyClauses clauseException) {
            // Example test case: try searching the lucene demo collection for "a*"
            // and you'll hit this exception

            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

            if (query instanceof MultiTermQuery) {

                // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
                // This will at least expand the query to its terms when searching with wildcards at section-level
                // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

                MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
                customRewriteMethod.setDocCountPercent(100.0);
                customRewriteMethod.setTermCountCutoff(350); // same as default

                MultiTermQuery multiTermQuery = (MultiTermQuery)query;
                multiTermQuery.setRewriteMethod(customRewriteMethod);
                try {
                    query = query.rewrite(reader);
                }
                catch (BooleanQuery.TooManyClauses clauseExceptionAgain) {

                    // do what the code originally did: use the default rewriteMethod which
                    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

                    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
                    query = query.rewrite(reader);
                }
            }
        }

        // BooleanQuery.java recurses rewriting any query until it is identical before and after rewrite,
        // see reference to "recursively rewrite" in
        // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
        if (orig_query == query) {
            return query;
        } else {
            return recursivelyRewriteQuery(query, reader, lucene_query_result);
        }
    }

    protected Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            BytesRef lower_term = new BytesRef(matcher.group(3));
            BytesRef upper_term = new BytesRef(matcher.group(4));
            boolean include_upper = matcher.group(5).equals("]");
            result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }

    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    static protected boolean query_result_caching_enabled = false;


    static public void main (String args[])
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort] [-dco AND|OR] [-startresults number -endresults number] [query]");
            return;
        }

        try {
            String index_directory = args[0];

            GS2LuceneQuery queryer = new GS2LuceneQuery();
            queryer.setIndexDir(index_directory);

            // Prepare the index cache directory, if query result caching is enabled
            if (query_result_caching_enabled) {
                // Make the index cache directory if it doesn't already exist
                File index_cache_directory = new File(index_directory, "cache");
                if (!index_cache_directory.exists()) {
                    index_cache_directory.mkdir();
                }

                // Disable caching if the index cache directory isn't available
                if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
                    query_result_caching_enabled = false;
                }
            }

            String query_string = null;

            // Parse the command-line arguments
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    queryer.setSortField(args[i]);
                }
                else if (args[i].equals("-reverse_sort")) {
                    queryer.setReverseSort(true);
                }
                else if (args[i].equals("-filter")) {
                    i++;
                    queryer.setFilterString(args[i]);
                }
                else if (args[i].equals("-dco")) {
                    i++;
                    queryer.setDefaultConjunctionOperator(args[i]);
                }
                else if (args[i].equals("-fuzziness")) {
                    i++;
                    queryer.setFuzziness(args[i]);
                }
                else if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setStartResults(Integer.parseInt(args[i]));
                    }
                }
                else if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setEndResults(Integer.parseInt(args[i]));
                    }
                }
                else {
                    query_string = args[i];
                }
            }

            if (!queryer.initialise()) {
                queryer.cleanUp(); // will close reader object IF reader was instantiated
                return;
            }

            // The query string has been specified as a command-line argument
            if (query_string != null) {
                runQueryCaching(index_directory, queryer, query_string);
            }

            // Read queries from STDIN
            else {
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                while (true) {
                    // Read the query from STDIN
                    query_string = in.readLine();
                    if (query_string == null || query_string.length() == 0) {
                        break;
                    }

                    runQueryCaching(index_directory, queryer, query_string);
                }
            }
            queryer.cleanUp();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
        throws IOException
    {
        StringBuffer query_results_xml = new StringBuffer();

        // Check if this query result has been cached from a previous search (if it's enabled)
        File query_result_cache_file = null;
        if (query_result_caching_enabled) {
            // Generate the cache file name from the query options
            String query_result_cache_file_name = query_string + "-";
            String fuzziness = queryer.getFuzziness();
            query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
            String filter_string = queryer.getFilterString();
            query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
            String sort_string = queryer.getSortField();
            query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
            String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
            query_result_cache_file_name += reverse_sort_string + "-";
            String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
            query_result_cache_file_name += default_conjunction_operator + "-";
            int start_results = queryer.getStartResults();
            int end_results = queryer.getEndResults();
            query_result_cache_file_name += start_results + "-" + end_results;
            query_result_cache_file_name = fileSafe(query_result_cache_file_name);

            // If the query result cache file exists, just return its contents and we're done
            File index_cache_directory = new File(index_directory, "cache");
            query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
            if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
                FileInputStream fis = new FileInputStream(query_result_cache_file);
                InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
                BufferedReader buffered_reader = new BufferedReader(isr);
                String line = "";
                while ((line = buffered_reader.readLine()) != null) {
                    query_results_xml.append(line + "\n");
                }
                String query_results_xml_string = query_results_xml.toString();
                query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");

                utf8out.print(query_results_xml_string);
                utf8out.flush();

                return;
            }
        }

        // not cached
        query_results_xml.append("<ResultSet cached=\"false\">\n");
        query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
        Filter filter = queryer.getFilter();
        if (filter != null) {
            query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
        }

        LuceneQueryResult query_result = queryer.runQuery(query_string);
        if (query_result == null) {
            System.err.println("Couldn't run the query");
            return;
        }

        if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
            query_results_xml.append("<Error type=\"" + query_result.getErrorString() + "\" />\n");
        } else {
            query_results_xml.append(query_result.getXMLString());
        }
        query_results_xml.append("</ResultSet>\n");

        utf8out.print(query_results_xml);
        utf8out.flush();

        // Cache this query result, if desired
        if (query_result_caching_enabled) {
            // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
            //   bother with the full stack trace. It won't affect the functionality if we can't write some cache
            //   files, it will just affect the speed of subsequent requests.
            // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
            //   can get very long in some collections)
            try
            {
                FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
                query_result_cache_file_writer.write(query_results_xml.toString());
                query_result_cache_file_writer.close();
            }
            catch (Exception exception)
            {
                System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
            }
        }
    }

    protected static String fileSafe(String text)
    {
        StringBuffer file_safe_text = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            char character = text.charAt(i);
            if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
                file_safe_text.append(character);
            }
            else {
                file_safe_text.append('%');
                file_safe_text.append((int) character);
            }
        }
        return file_safe_text.toString();
    }

}