source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 32729

Last change on this file since 32729 was 32729, checked in by ak19, 5 years ago

Part of Ticket #947 the Lucene Index File Locking Fix. Lucene search had broken for GS2, thanks to Pascal Angst for identifying this. The fix committed at that time was incomplete as it had not been applied for GS2, because GS2 went through GS2LuceneQuery's main() method. GS2LuceneQuery still managed to compile after the fix with no syntax errors, because the superclass' initialise() method ended up getting called from main(), instead of the new GS2LuceneQuery.initialise(IndexReader) variant. Now the GS2LuceneQuery.main() method used by GS2 behaves like the GS2LuceneSearch class used by GS3: first instantiating an IndexReader object then passing this to the GS2LuceneQuery object's initialise(IndexReader) method. Lucene searching should work again in GS2, will test.

File size: 32.0 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanClause;
43import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
44import org.apache.lucene.search.ConstantScoreQuery;
45import org.apache.lucene.search.Filter;
46import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
47import org.apache.lucene.search.MultiTermQuery;
48import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
49import org.apache.lucene.search.Query;
50import org.apache.lucene.search.TermRangeFilter;
51import org.apache.lucene.search.ScoreDoc;
52import org.apache.lucene.search.Sort;
53import org.apache.lucene.search.SortField;
54import org.apache.lucene.search.TopFieldDocs;
55
56import org.apache.lucene.index.DocsEnum;
57import org.apache.lucene.index.MultiFields;
58
59import org.apache.lucene.store.Directory;
60import org.apache.lucene.store.FSDirectory;
61
62import org.apache.lucene.util.Bits;
63import org.apache.lucene.util.BytesRef;
64import org.apache.lucene.util.Version;
65
66public class GS2LuceneQuery extends SharedSoleneQuery
67{
68 public static String SORT_RANK = "rank";
69 public static String SORT_NATURAL = "natural";
70
71 protected String full_indexdir="";
72
73 protected SortField.Type sort_type = SortField.Type.SCORE;
74 protected boolean reverse_sort = false;
75 protected Sort sorter=new Sort();
76 protected Filter filter = null;
77
78 protected QueryParser query_parser = null;
79 protected QueryParser query_parser_no_stop_words = null;
80 protected IndexSearcher searcher = null;
81 protected IndexReader reader = null; // reference to a Reader resource. GS2LuceneQuery doesn't maintain it, GS2LuceneSearch maintains it!
82 // GS2LuceneSearch locally instantiates one GS2LuceneQuery object per query then allows each Query instance use a relevant Reader.
83 // But GS2LuceneSearch opens the IndexReaders and, more importantly, closes them all when a collection is deactivated.
84
85 public GS2LuceneQuery() {
86 super();
87
88 // Create one query parser with the standard set of stop words, and one with none
89
90 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
91 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
92 }
93
94 public boolean initialise(IndexReader reader) {
95
96 if (!super.initialise()) {
97 return false;
98 }
99
100
101 if (full_indexdir==null || full_indexdir.length()==-1){
102 utf8out.println("Index directory is not indicated ");
103 utf8out.flush();
104 return false;
105 }
106
107 if(reader == null) {
108 return false;
109 }
110 else {
111 this.reader = reader;
112 this.searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
113 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
114 return true;
115 }
116 }
117
    /** Records the full path to the index directory; checked for non-emptiness in initialise(). */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }
121
122 public void setSortField(String sort_field) {
123 if (sort_field.equals(SORT_RANK)) {
124 this.sort_field = null;
125 this.sort_type = SortField.Type.SCORE;
126 } else if (sort_field.equals(SORT_NATURAL)) {
127 this.sort_field = null;
128 this.sort_type = SortField.Type.DOC;
129 } else {
130 this.sort_field = sort_field;
131 this.sort_type = SortField.Type.STRING; // for now. numeric??
132 }
133 }
    /** Sets whether results should be returned in reverse of the chosen sort order. */
    public void setReverseSort(boolean reverse) {
	this.reverse_sort = reverse;
    }
    /** @return true when results are to be returned in reverse sort order */
    public boolean getReverseSort() {
	return this.reverse_sort;
    }
140
    /**
     * Stores the raw filter string (via the superclass, for cache-key building)
     * and additionally parses it into the Lucene Filter applied by runQuery().
     */
    public void setFilterString(String filter_string) {
	super.setFilterString(filter_string);
	this.filter = parseFilterString(filter_string);
    }

    /** @return the Filter parsed from the last setFilterString() call, or null if none/unparseable */
    public Filter getFilter() {
	return this.filter;
    }
149
150
    /**
     * Runs the given raw query string against the index and returns a populated
     * LuceneQueryResult containing: the expanded query terms with their document
     * and total frequencies, the stop words removed from the query, and the
     * matching documents (docOID, score, per-document term frequency) between
     * start_results and end_results.
     *
     * Requires initialise(IndexReader) to have succeeded first (this.reader and
     * this.searcher must be set).
     *
     * @return the result object (with an error code set on failure), or null when
     *         query_string is null/empty
     */
    public LuceneQueryResult runQuery(String query_string) {

	if (query_string == null || query_string.equals("")) {
	    utf8out.println("The query word is not indicated ");
	    utf8out.flush();
	    return null;
	}

	LuceneQueryResult lucene_query_result = new LuceneQueryResult();
	lucene_query_result.clear();

	if(this.reader == null) {
	    // Diagnostic only: execution continues and will NPE below.
	    // Indicates initialise(reader) was never called or failed.
	    System.err.println("#### Reader is null!");
	}

	try {
	    // Parse the query twice: once keeping stop words, so that the set
	    // difference against the stop-word-filtered parse (further below)
	    // reveals which stop words were removed.
	    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
	    query_including_stop_words = query_including_stop_words.rewrite(reader);

	    // System.err.println("********* query_string " + query_string + "****");

	    // The real query: parsed with stop-word removal (and optional fuzziness),
	    // then recursively rewritten so wildcard clauses expand into concrete Terms.
	    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
	    query = recursivelyRewriteQuery(query, reader, lucene_query_result);
	    // System.err.println("@@@@ final query class name: " + query.getClass());

	    // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
	    // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
	    // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
	    // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
	    // http://lucene.apache.org/core/4_7_2/MIGRATE.html

	    // Get the list of expanded query terms and their frequencies
	    // num docs matching, and total frequency
	    HashSet terms = new HashSet();
	    query.extractTerms(terms);

	    // Maps Lucene doc number -> summed frequency of all query terms in that doc.
	    HashMap doc_term_freq_map = new HashMap();

	    Iterator iter = terms.iterator();

	    Bits liveDocs = null;
	    if(reader.hasDeletions()) {
		System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
		liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
	    }

	    while (iter.hasNext()) {

		// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

		Term term = (Term) iter.next();
		// System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
		BytesRef term_bytes = term.bytes();
		DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

		// Get the term frequency over all the documents
		//TermDocs term_docs = reader.termDocs(term);
		int term_freq = 0;   // total occurrences of this term across all live docs
		int match_docs = 0;  // number of live docs containing this term

		if(term_docs != null) {
		    int docID = -1;
		    while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
			if (term_docs.freq() != 0)
			{
			    term_freq += term_docs.freq();
			    match_docs++;

			    // Calculate the document-level term frequency as well
			    Integer lucene_doc_num_obj = new Integer(term_docs.docID());
			    int doc_term_freq = 0;
			    if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
			    {
				doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
			    }
			    doc_term_freq += term_docs.freq();

			    doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
			}
		    }
		} else {
		    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
		}

		// Create a term
		lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
	    }

	    // Get the list of stop words removed from the query:
	    // any term present in the unfiltered parse but absent from the
	    // stop-word-filtered parse must have been a stop word.
	    HashSet terms_including_stop_words = new HashSet();
	    query_including_stop_words.extractTerms(terms_including_stop_words);
	    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
	    while (terms_including_stop_words_iter.hasNext()) {
		Term term = (Term) terms_including_stop_words_iter.next();
		if (!terms.contains(term)) {
		    lucene_query_result.addStopWord(term.text());
		}
	    }

	    // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
	    // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
	    // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

	    // 1. Figure out how many results there will be.
	    //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
	    //searcher.search(query, filter, collector);
	    //int hitCount = collector.count;

	    // Actually do the query
	    // Simple case for getting all the matching documents
	    // NOTE(review): passing Integer.MAX_VALUE as the result-count cap may be
	    // memory-hungry on very large result sets — confirm acceptable for GS2 indexes.
	    if (end_results == Integer.MAX_VALUE) {
		// Perform the query (filter and sorter may be null)
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
		// Is there a slight difference in the definition between
		// https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
		// and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
		// Seems to be okay.
		// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

		lucene_query_result.setTotalDocs(hits.totalHits);

		// Output the matching documents
		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(hits.totalHits); // ??

		for (int i = start_results; i < hits.totalHits; i++) {
		    int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
		    Document doc = reader.document(lucene_doc_num);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
		}
	    }

	    // Slightly more complicated case for returning a subset of the matching documents
	    else {
		// Perform the query (filter may be null)
		TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
		// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
		lucene_query_result.setTotalDocs(hits.totalHits);

		lucene_query_result.setStartResults(start_results);
		lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

		// Output the matching documents
		// NOTE(review): start_results appears to be 1-based here (loop starts at
		// start_results, not start_results-1) — confirm against lucene_query.pl.
		for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
		    int lucene_doc_num = hits.scoreDocs[i].doc;
		    Document doc = reader.document(lucene_doc_num);
		    int doc_term_freq = 0;
		    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
		    if (doc_term_freq_object != null)
		    {
			doc_term_freq = doc_term_freq_object.intValue();
		    }
		    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
		}
	    }
	}

	catch (ParseException parse_exception) {
	    lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
	}
	catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
	}
	catch (IOException exception) {
	    lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
	    exception.printStackTrace();
	}
	catch (Exception exception) {
	    lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
	    exception.printStackTrace();
	}
	return lucene_query_result;
    }
330
331 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
332 super.setDefaultConjunctionOperator(default_conjunction_operator);
333
334 if (default_conjunction_operator.equals("AND")) {
335 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
336 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
337 } else { // default is OR
338 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
339 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
340 }
341 }
342
    // This version of the cleanUp() method is just to clean up anything associated only with this instance of GS2LuceneQuery.
    // So it won't clean up the singleton IndexReader instances maintained by the encapsulating GS2LuceneSearch class.
    /** Releases per-query state. Deliberately does NOT close this.reader — its owner does. */
    public void cleanUp() {
	super.cleanUp();

	// Drop the searcher reference only; it merely wraps the shared reader.
	searcher = null;

	// Don't close the indexReader reference here.
	// This has moved into the GS2LuceneSearch.cleanUp() method, as it maintains singleton IndexReaders
	// for each index level (sidx, didix) with lifespans matching their collection's lifespan
	// A collection's GS2LuceneSearch object lives for the duration of the Collection.
	// A GS2LuceneQuery object is ephemeral: only lives for the duration of a query, allowing multiple
	// users to do queries concurrently, sharing a single IndexReader object for each indexing level
	// since IndexReaders support concurrency.
    }
358
    /**
     * Parses a raw query string into a Lucene Query. The string is assumed to
     * start with a parenthesised search-term group ("+(...)"), followed by an
     * optional filter suffix; when a fuzziness value is set, a "~fuzziness"
     * suffix is appended to every plain TX:term clause (phrase clauses are left
     * alone) before re-parsing.
     *
     * @param reader used only to rewrite the initially parsed query
     * @param query_parser the parser to use (with or without stop words)
     * @param query_string the raw query, e.g. "+(TX:(snail farm)) +SI:[...]"
     * @param fuzziness fuzzy-match factor as a string, or null for exact matching
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	// prefix = the balanced "+(...)" search-term group; suffix = everything after it
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset into mutable_query_string
	    // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    // (matching the two characters of TEXTFIELD then ':')
	    int s = 0; // State
	    while(o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness is String concatenation (fuzziness is a String),
			// so this inserts e.g. "~0.8" just before the terminator.
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
460
    // If you're dealing with a BooleanQuery, they need to be recursively rewritten
    // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery)
    // e.g. season* farm
    // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
    // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
    // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
    // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
    // Call this method from runQuery() after it calls parseQuery().
    // Now searches like these will work
    // season* farm
    // season* farm*
    // and not just searches like the following which already used to work:
    // season*
    // snail farm
    // Idea for the solution of recursively processing a BooleanQuery came from inspecting source code to BooleanQuery.java
    // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
    // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
    // subcomponents.
    /**
     * Recursively rewrites the query so wildcard/prefix (MultiTermQuery) clauses
     * expand into concrete Terms, falling back through progressively cheaper
     * rewrite methods when BooleanQuery.TooManyClauses is thrown. See the long
     * comment above and http://trac.greenstone.org/ticket/845 for the history.
     */
    protected Query recursivelyRewriteQuery(Query orig_query, IndexReader reader, LuceneQueryResult lucene_query_result) throws java.io.IOException
    {
	//Query query = orig_query.rewrite(reader);
	Query query = orig_query;

	// First recurse into BooleanQuery sub-clauses so nested wildcard clauses
	// get their rewrite method set before the top-level rewrite below.
	if(orig_query instanceof BooleanQuery) {
	    BooleanQuery booleanQuery = (BooleanQuery)orig_query;
	    List<BooleanClause> clauses = booleanQuery.clauses();
	    for (BooleanClause clause : clauses) {
		Query subQuery = clause.getQuery();
		subQuery = recursivelyRewriteQuery(subQuery, reader, lucene_query_result);
		clause.setQuery(subQuery);
	    }
	}

	// GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
	// This change in lucene core library for GS3 (present since after version 2.4.1) had the
	// side-effect that searching on "econom*" didn't display what terms it was searching for,
	// whereas it had done so in GS2.

	// The details of this problem and its current solution are explained in the ticket
	// http://trac.greenstone.org/ticket/845

	// We need to change the settings for the rewriteMethod in order to get searches on wildcards
	// to produce search terms again when the query gets rewritten.

	// We try, in order:
	// 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
	// it will expand wildcard searches to its terms when searching at both section AND doc level.
	// If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
	// 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
	// If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
	// 3. Then try the default apache rewriteMethod with its optimum defaults of
	// termCountCutoff=350 and docCountPercent cutoff=0.1%
	// See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

	//System.err.println("@@@@ query class name: " + orig_query.getClass());
	//System.err.println("@@@@ QUERY: " + orig_query);

	if(orig_query instanceof MultiTermQuery) {
	    MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
	    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
	    // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
	}

	try {
	    query = orig_query.rewrite(reader);
	}
	catch(BooleanQuery.TooManyClauses clauseException) {
	    // Example test case: try searching the lucene demo collection for "a*"
	    // and you'll hit this exception

	    lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

	    if(query instanceof MultiTermQuery) {

		// CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
		// This will at least expand the query to its terms when searching with wildcards at section-level
		// (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

		MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
		customRewriteMethod.setDocCountPercent(100.0);
		customRewriteMethod.setTermCountCutoff(350); // same as default

		MultiTermQuery multiTermQuery = (MultiTermQuery)query;
		multiTermQuery.setRewriteMethod(customRewriteMethod);
		try {
		    query = query.rewrite(reader);
		}
		catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {

		    // do what the code originally did: use the default rewriteMethod which
		    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

		    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
		    query = query.rewrite(reader);
		}
	    }
	}

	// BooleanQuery.java recurses rewriting any query until it is identical before and after rewrite,
	// see reference to "recursively rewrite" in
	// https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
	if(orig_query == query) {
	    return query;
	} else {
	    return recursivelyRewriteQuery(query, reader, lucene_query_result);
	}
    }
568
569 protected Filter parseFilterString(String filter_string)
570 {
571 Filter result = null;
572 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
573 Matcher matcher = pattern.matcher(filter_string);
574 if (matcher.matches()) {
575 String field_name = matcher.group(1);
576 boolean include_lower = matcher.group(2).equals("[");
577 BytesRef lower_term = new BytesRef(matcher.group(3));
578 BytesRef upper_term = new BytesRef(matcher.group(4));
579 boolean include_upper = matcher.group(5).equals("]");
580 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
581 }
582 else {
583 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
584 }
585 return result;
586 }
587
588
589 /** command line program and auxiliary methods */
590
591 // Fairly self-explanatory I should hope
592 static protected boolean query_result_caching_enabled = false;
593
594 /**
595 * This main() method is used by GS2 to do searches.
596 * In GS2, lucene_query.pl calles this main() method in the LuceneWrapper4.jar. This main method instantiates both
597 * a GS2LuceneQuery and an IndexReader object. It then passes the reader to the GS2LuceneQuery object by calling
598 * the GS2LuceneQuery.initialise(reader) method. This main() method then finally performs the search with the provided query.
599 * GS3 doesn't use this main() method. Instead a GS2LuceneSearch object (of gsdl3.jar) instantiates both
600 * the GS2LuceneQuery and IndexReader objects and proceeds the same way.
601 */
602 static public void main (String args[])
603 {
604 if (args.length == 0) {
605 System.out.println("Usage: org.greenstone.LuceneWrapper4.GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
606 return;
607 }
608
609 try {
610 String index_directory = args[0];
611
612 GS2LuceneQuery queryer = new GS2LuceneQuery();
613 queryer.setIndexDir(index_directory);
614
615 // Prepare the index cache directory, if query result caching is enabled
616 if (query_result_caching_enabled) {
617 // Make the index cache directory if it doesn't already exist
618 File index_cache_directory = new File(index_directory, "cache");
619 if (!index_cache_directory.exists()) {
620 index_cache_directory.mkdir();
621 }
622
623 // Disable caching if the index cache directory isn't available
624 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
625 query_result_caching_enabled = false;
626 }
627 }
628
629 String query_string = null;
630
631 // Parse the command-line arguments
632 for (int i = 1; i < args.length; i++) {
633 if (args[i].equals("-sort")) {
634 i++;
635 queryer.setSortField(args[i]);
636 }
637 else if (args[i].equals("-reverse_sort")) {
638 queryer.setReverseSort(true);
639 }
640 else if (args[i].equals("-filter")) {
641 i++;
642 queryer.setFilterString(args[i]);
643 }
644 else if (args[i].equals("-dco")) {
645 i++;
646 queryer.setDefaultConjunctionOperator(args[i]);
647 }
648 else if (args[i].equals("-fuzziness")) {
649 i++;
650 queryer.setFuzziness(args[i]);
651 }
652 else if (args[i].equals("-startresults")) {
653 i++;
654 if (args[i].matches("\\d+")) {
655 queryer.setStartResults(Integer.parseInt(args[i]));
656 }
657 }
658 else if (args[i].equals("-endresults")) {
659 i++;
660 if (args[i].matches("\\d+")) {
661 queryer.setEndResults(Integer.parseInt(args[i]));
662 }
663 }
664 else {
665 query_string = args[i];
666 }
667 }
668
669 Directory full_indexdir_dir = FSDirectory.open(new File(index_directory));
670 IndexReader reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory.
671 // Now readOnly=true by default, and therefore also for searcher created in initialise() call below.
672 if (!queryer.initialise(reader)) {
673 if(reader != null) reader.close(); // close reader object IF reader was instantiated
674 queryer.cleanUp(); // will close searcher object if non-null
675 return;
676 }
677
678 // The query string has been specified as a command-line argument
679 if (query_string != null) {
680 runQueryCaching(index_directory, queryer, query_string);
681 }
682
683 // Read queries from STDIN
684 else {
685 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
686 while (true) {
687 // Read the query from STDIN
688 query_string = in.readLine();
689 if (query_string == null || query_string.length() == -1) {
690 break;
691 }
692
693 runQueryCaching(index_directory, queryer, query_string);
694
695 }
696 }
697 if(reader != null) reader.close();
698 queryer.cleanUp();
699 }
700 catch (IOException exception) {
701 exception.printStackTrace();
702 }
703 }
704
705 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
706 throws IOException
707 {
708 StringBuffer query_results_xml = new StringBuffer();
709
710 // Check if this query result has been cached from a previous search (if it's enabled)
711 File query_result_cache_file = null;
712 if (query_result_caching_enabled) {
713 // Generate the cache file name from the query options
714 String query_result_cache_file_name = query_string + "-";
715 String fuzziness = queryer.getFuzziness();
716 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
717 String filter_string = queryer.getFilterString();
718 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
719 String sort_string = queryer.getSortField();
720 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
721 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
722 query_result_cache_file_name += reverse_sort_string + "-";
723 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
724 query_result_cache_file_name += default_conjunction_operator + "-";
725 int start_results = queryer.getStartResults();
726 int end_results = queryer.getEndResults();
727 query_result_cache_file_name += start_results + "-" + end_results;
728 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
729
730 // If the query result cache file exists, just return its contents and we're done
731 File index_cache_directory = new File(index_directory, "cache");
732 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
733 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
734 FileInputStream fis = new FileInputStream(query_result_cache_file);
735 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
736 BufferedReader buffered_reader = new BufferedReader(isr);
737 String line = "";
738 while ((line = buffered_reader.readLine()) != null) {
739 query_results_xml.append(line + "\n");
740 }
741 String query_results_xml_string = query_results_xml.toString();
742 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
743
744 utf8out.print(query_results_xml_string);
745 utf8out.flush();
746
747 return;
748 }
749 }
750
751 // not cached
752 query_results_xml.append("<ResultSet cached=\"false\">\n");
753 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
754 Filter filter = queryer.getFilter();
755 if (filter != null) {
756 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
757 }
758
759 LuceneQueryResult query_result = queryer.runQuery(query_string);
760 if (query_result == null) {
761 System.err.println("Couldn't run the query");
762 return;
763 }
764
765 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
766 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
767 } else {
768 query_results_xml.append(query_result.getXMLString());
769 }
770 query_results_xml.append("</ResultSet>\n");
771
772 utf8out.print(query_results_xml);
773 utf8out.flush();
774
775 // Cache this query result, if desired
776 if (query_result_caching_enabled) {
777 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
778 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
779 // files, it will just affect the speed of subsequent requests.
780 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
781 // can get very long in some collections)
782 try
783 {
784 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
785 query_result_cache_file_writer.write(query_results_xml.toString());
786 query_result_cache_file_writer.close();
787 }
788 catch (Exception exception)
789 {
790 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
791 }
792 }
793 }
794
795 protected static String fileSafe(String text)
796 {
797 StringBuffer file_safe_text = new StringBuffer();
798 for (int i = 0; i < text.length(); i++) {
799 char character = text.charAt(i);
800 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
801 file_safe_text.append(character);
802 }
803 else {
804 file_safe_text.append('%');
805 file_safe_text.append((int) character);
806 }
807 }
808 return file_safe_text.toString();
809 }
810
811
812}
813
814
Note: See TracBrowser for help on using the repository browser.