root/main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java @ 32609

Revision 32609, 30.4 KB (checked in by ak19, 7 months ago)

Preliminary stage before tackling a different bug. This commit fixes the index-folder file-locking problem that occurs on Windows when collection deactivation doesn't close all file handles to the collection's index folder after some lucene searches have been run. Inspecting the code revealed the possibility of a further, unrelated bug, for which Kathy devised a test to confirm its existence. Testing showed the bug is real: multiple queries configure the same query object (and its internal reader object), but the last configuration is always the one used to run a search. For example, one user wants to search a lucene collection at document level and a second user wants to search the same collection at section level. The second user's configuration wins if it is applied between the first user's query object being configured and their query being run, so the first user ends up seeing search results at section level.
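To illustrate that second bug (this sketch is not part of the revision): two callers sharing one GS2LuceneQuery instance can interleave so that the second caller's configuration is the one in force when the first caller's search runs. The sketch uses only the class's own setIndexDir()/initialise()/runQuery()/cleanUp() methods; the index paths, class name and query string are made up for the example.

// Sketch only: demonstrates the shared-configuration race described in the commit message above.
// Assumes GS2LuceneQuery and its dependencies are on the classpath; the "didx" (document-level)
// and "sidx" (section-level) index paths below are hypothetical.
package org.greenstone.LuceneWrapper4;

public class SharedQueryObjectSketch
{
    public static void main(String[] args)
    {
        GS2LuceneQuery shared_queryer = new GS2LuceneQuery();

        // User 1 configures the shared object for a document-level search
        shared_queryer.setIndexDir("/path/to/collect/demo/index/didx");
        shared_queryer.initialise();

        // Before user 1's search runs, user 2 reconfigures the SAME object for section level
        shared_queryer.setIndexDir("/path/to/collect/demo/index/sidx");
        shared_queryer.initialise();

        // User 1's search now runs against user 2's configuration,
        // so user 1 gets section-level results instead of document-level ones.
        LuceneQueryResult results_seen_by_user1 = shared_queryer.runQuery("snail farm");

        shared_queryer.cleanUp();
    }
}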

/**********************************************************************
 *
 * GS2LuceneQuery.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.LuceneWrapper4;

import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
//import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermRangeFilter;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopFieldDocs;

import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.MultiFields;

import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class GS2LuceneQuery extends SharedSoleneQuery
{
    public static String SORT_RANK = "rank";
    public static String SORT_NATURAL = "natural";

    protected String full_indexdir = "";

    protected SortField.Type sort_type = SortField.Type.SCORE;
    protected boolean reverse_sort = false;
    protected Sort sorter = new Sort();
    protected Filter filter = null;

    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    protected IndexSearcher searcher = null;
    protected IndexReader reader = null;

    public GS2LuceneQuery() {
        super();

        // Create one query parser with the standard set of stop words, and one with none
        query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
        query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
    }

    public boolean initialise() {

        if (!super.initialise()) {
            return false;
        }

        if (full_indexdir == null || full_indexdir.length() == 0) {
            utf8out.println("Index directory is not indicated ");
            utf8out.flush();
            return false;
        }

        try {
            if (reader != null) {
                reader.close();
                searcher = null;
            }

            Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));

            reader = DirectoryReader.open(full_indexdir_dir); // Returns an IndexReader reading the index in the given Directory. Now readOnly=true by default, and therefore also for searcher
            searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields

            this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
        }
        catch (IOException exception) {
            exception.printStackTrace();
            return false;
        }
        return true;
    }

    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }

    public void setSortField(String sort_field) {
        if (sort_field.equals(SORT_RANK)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.SCORE;
        } else if (sort_field.equals(SORT_NATURAL)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.DOC;
        } else {
            this.sort_field = sort_field;
            this.sort_type = SortField.Type.STRING; // for now. numeric??
        }
    }

    public void setReverseSort(boolean reverse) {
        this.reverse_sort = reverse;
    }

    public boolean getReverseSort() {
        return this.reverse_sort;
    }

    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }

    public Filter getFilter() {
        return this.filter;
    }

    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result = new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            // System.err.println("********* query_string " + query_string + "****");

            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = recursivelyRewriteQuery(query, reader, lucene_query_result);
            // System.err.println("@@@@ final query class name: " + query.getClass());

            // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
            // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
            // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
            // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
            // http://lucene.apache.org/core/4_7_2/MIGRATE.html

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            HashMap doc_term_freq_map = new HashMap();

            Iterator iter = terms.iterator();

            Bits liveDocs = null;
            if (reader.hasDeletions()) {
                System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
                liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
            }

            while (iter.hasNext()) {

                // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

                Term term = (Term) iter.next();
                // System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
                BytesRef term_bytes = term.bytes();
                DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

                // Get the term frequency over all the documents
                //TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;

                if (term_docs != null) {
                    int docID = -1;
                    while ((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) { //while (term_docs.next())
                        if (term_docs.freq() != 0)
                        {
                            term_freq += term_docs.freq();
                            match_docs++;

                            // Calculate the document-level term frequency as well
                            Integer lucene_doc_num_obj = new Integer(term_docs.docID());
                            int doc_term_freq = 0;
                            if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
                            {
                                doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
                            }
                            doc_term_freq += term_docs.freq();

                            doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
                        }
                    }
                } else {
                    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
            // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
            // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

            // 1. Figure out how many results there will be.
            //TotalHitCountCollector countCollector = new TotalHitCountCollector();
            //searcher.search(query, filter, collector);
            //int hitCount = collector.count;

            // Actually do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // Is there a slight difference in the definition between
                // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
                // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
                // Seems to be okay.
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

                lucene_query_result.setTotalDocs(hits.totalHits);

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.totalHits); // ??

                for (int i = start_results; i < hits.totalHits; i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results : hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }

    public void setDefaultConjunctionOperator(String default_conjunction_operator) {
        super.setDefaultConjunctionOperator(default_conjunction_operator);

        if (default_conjunction_operator.equals("AND")) {
            query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
        } else { // default is OR
            query_parser.setDefaultOperator(QueryParser.OR_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.OR_OPERATOR);
        }
    }


    public void cleanUp() {
        super.cleanUp();
        try {
            if (reader != null) {
                reader.close();
                // Closes files associated with this index. Also saves any new deletions to disk.
                // No other methods should be called after this has been called.
            }
        } catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term... add the
                    // fuzzy search indicator
                    // Nor outside the scope of parentheses
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }

    // If you're dealing with a BooleanQuery, it needs to be recursively rewritten
    // as it can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery),
    // e.g. season* farm
    // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
    // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
    // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
    // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
    // Call this method from runQuery() after it calls parseQuery().
    // Now searches like these will work
    //    season* farm
    //    season* farm*
    // and not just searches like the following which already used to work:
    //    season*
    //    snail farm
    // Idea for the solution of recursively processing a BooleanQuery came from inspecting the source code of BooleanQuery.java
    //    https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
    // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
    // subcomponents.
    protected Query recursivelyRewriteQuery(Query orig_query, IndexReader reader, LuceneQueryResult lucene_query_result) throws java.io.IOException
    {
        //Query query = orig_query.rewrite(reader);
        Query query = orig_query;

        if (orig_query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery)orig_query;
            List<BooleanClause> clauses = booleanQuery.clauses();
            for (BooleanClause clause : clauses) {
                Query subQuery = clause.getQuery();
                subQuery = recursivelyRewriteQuery(subQuery, reader, lucene_query_result);
                clause.setQuery(subQuery);
            }
        }

        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
        // This change in the lucene core library for GS3 (present since after version 2.4.1) had the
        // side-effect that searching on "econom*" didn't display what terms it was searching for,
        // whereas it had done so in GS2.

        // The details of this problem and its current solution are explained in the ticket
        // http://trac.greenstone.org/ticket/845

        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
        // to produce search terms again when the query gets rewritten.

        // We try, in order:
        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2:
        //    it will expand wildcard searches to their terms when searching at both section AND doc level.
        //    If that throws a TooManyClauses exception (like when searching for "a*" over the lucene demo collection),
        // 2. then try a custom rewriteMethod which keeps termCountCutoff=350 but raises the docCountPercent cutoff to 100%.
        //    If that also throws a TooManyClauses exception (which could perhaps happen if the collection has a huge number of docs),
        // 3. then fall back to the default apache rewriteMethod with its optimum defaults of
        //    termCountCutoff=350 and docCountPercent cutoff=0.1%.
        //    See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

        //System.err.println("@@@@ query class name: " + orig_query.getClass());
        //System.err.println("@@@@ QUERY: " + orig_query);

        if (orig_query instanceof MultiTermQuery) {
            MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
            // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE
        }

        try {
            query = orig_query.rewrite(reader);
        }
        catch (BooleanQuery.TooManyClauses clauseException) {
            // Example test case: try searching the lucene demo collection for "a*"
            // and you'll hit this exception

            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

            if (query instanceof MultiTermQuery) {

                // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
                // This will at least expand the query to its terms when searching with wildcards at section-level
                // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

                MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
                customRewriteMethod.setDocCountPercent(100.0);
                customRewriteMethod.setTermCountCutoff(350); // same as default

                MultiTermQuery multiTermQuery = (MultiTermQuery)query;
                multiTermQuery.setRewriteMethod(customRewriteMethod);
                try {
                    query = query.rewrite(reader);
                }
                catch (BooleanQuery.TooManyClauses clauseExceptionAgain) {

                    // do what the code originally did: use the default rewriteMethod which
                    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

                    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
                    query = query.rewrite(reader);
                }
            }
        }

        // BooleanQuery.java recurses rewriting any query until it is identical before and after rewrite,
        // see reference to "recursively rewrite" in
        // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
        if (orig_query == query) {
            return query;
        } else {
            return recursivelyRewriteQuery(query, reader, lucene_query_result);
        }
    }

    protected Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            BytesRef lower_term = new BytesRef(matcher.group(3));
            BytesRef upper_term = new BytesRef(matcher.group(4));
            boolean include_upper = matcher.group(5).equals("]");
            result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }

    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    static protected boolean query_result_caching_enabled = false;


    static public void main (String args[])
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort] [-dco AND|OR] [-startresults number -endresults number] [query]");
            return;
        }

        try {
            String index_directory = args[0];

            GS2LuceneQuery queryer = new GS2LuceneQuery();
            queryer.setIndexDir(index_directory);

            // Prepare the index cache directory, if query result caching is enabled
            if (query_result_caching_enabled) {
                // Make the index cache directory if it doesn't already exist
                File index_cache_directory = new File(index_directory, "cache");
                if (!index_cache_directory.exists()) {
                    index_cache_directory.mkdir();
                }

                // Disable caching if the index cache directory isn't available
                if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
                    query_result_caching_enabled = false;
                }
            }

            String query_string = null;

            // Parse the command-line arguments
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    queryer.setSortField(args[i]);
                }
                else if (args[i].equals("-reverse_sort")) {
                    queryer.setReverseSort(true);
                }
                else if (args[i].equals("-filter")) {
                    i++;
                    queryer.setFilterString(args[i]);
                }
                else if (args[i].equals("-dco")) {
                    i++;
                    queryer.setDefaultConjunctionOperator(args[i]);
                }
                else if (args[i].equals("-fuzziness")) {
                    i++;
                    queryer.setFuzziness(args[i]);
                }
                else if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setStartResults(Integer.parseInt(args[i]));
                    }
                }
                else if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setEndResults(Integer.parseInt(args[i]));
                    }
                }
                else {
                    query_string = args[i];
                }
            }

            if (!queryer.initialise()) {
                queryer.cleanUp(); // will close reader object IF reader was instantiated
                return;
            }

            // The query string has been specified as a command-line argument
            if (query_string != null) {
                runQueryCaching(index_directory, queryer, query_string);
            }

            // Read queries from STDIN
            else {
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                while (true) {
                    // Read the query from STDIN
                    query_string = in.readLine();
                    if (query_string == null || query_string.length() == 0) {
                        break;
                    }

                    runQueryCaching(index_directory, queryer, query_string);
                }
            }
            queryer.cleanUp();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
        throws IOException
    {
        StringBuffer query_results_xml = new StringBuffer();

        // Check if this query result has been cached from a previous search (if it's enabled)
        File query_result_cache_file = null;
        if (query_result_caching_enabled) {
            // Generate the cache file name from the query options
            String query_result_cache_file_name = query_string + "-";
            String fuzziness = queryer.getFuzziness();
            query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
            String filter_string = queryer.getFilterString();
            query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
            String sort_string = queryer.getSortField();
            query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
            String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
            query_result_cache_file_name += reverse_sort_string + "-";
            String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
            query_result_cache_file_name += default_conjunction_operator + "-";
            int start_results = queryer.getStartResults();
            int end_results = queryer.getEndResults();
            query_result_cache_file_name += start_results + "-" + end_results;
            query_result_cache_file_name = fileSafe(query_result_cache_file_name);

            // If the query result cache file exists, just return its contents and we're done
            File index_cache_directory = new File(index_directory, "cache");
            query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
            if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
                FileInputStream fis = new FileInputStream(query_result_cache_file);
                InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
                BufferedReader buffered_reader = new BufferedReader(isr);
                String line = "";
                while ((line = buffered_reader.readLine()) != null) {
                    query_results_xml.append(line + "\n");
                }
                String query_results_xml_string = query_results_xml.toString();
                query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");

                utf8out.print(query_results_xml_string);
                utf8out.flush();

                return;
            }
        }

        // not cached
        query_results_xml.append("<ResultSet cached=\"false\">\n");
        query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
        Filter filter = queryer.getFilter();
        if (filter != null) {
            query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
        }

        LuceneQueryResult query_result = queryer.runQuery(query_string);
        if (query_result == null) {
            System.err.println("Couldn't run the query");
            return;
        }

        if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
            query_results_xml.append("<Error type=\"" + query_result.getErrorString() + "\" />\n");
        } else {
            query_results_xml.append(query_result.getXMLString());
        }
        query_results_xml.append("</ResultSet>\n");

        utf8out.print(query_results_xml);
        utf8out.flush();

        // Cache this query result, if desired
        if (query_result_caching_enabled) {
            // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
            //   bother with the full stack trace. It won't affect the functionality if we can't write some cache
            //   files, it will just affect the speed of subsequent requests.
            // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
            //   can get very long in some collections)
            try
            {
                FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
                query_result_cache_file_writer.write(query_results_xml.toString());
                query_result_cache_file_writer.close();
            }
            catch (Exception exception)
            {
                System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
            }
        }
    }

    protected static String fileSafe(String text)
    {
        StringBuffer file_safe_text = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            char character = text.charAt(i);
            if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
                file_safe_text.append(character);
            }
            else {
                file_safe_text.append('%');
                file_safe_text.append((int) character);
            }
        }
        return file_safe_text.toString();
    }

}