source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 32506

Last change on this file since 32506 was 32506, checked in by ak19, 6 years ago

Bugfix for a bug that Kathy discovered in code I had committed: with the upgrade to Lucene 4, wildcard searches would work, e.g. season*. But boolean searches that combined wildcard search terms with regular terms, or with other wildcard terms, didn't work. If a query was a BooleanQuery, it would not expand any wildcard search terms it contained, despite BooleanQuery otherwise recursively doing a rewrite as per its source code. The solution was to recursively rewrite the query ourselves, to additionally handle MultiTermQuery boolean clauses within a BooleanQuery besides the existing code that handles standalone MultiTermQuerys (which can be of type WildcardQuery or PrefixQuery, though they get wrapped in ConstantScoreQuery objects). I've moved the existing code that deals with MultiTermQuerys into the new recursive function, which now performs the further (recursive) step of rewriting BooleanQuerys to preserve and expand the MultiTermQuery objects they contain.

File size: 29.8 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanClause;
43import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
44import org.apache.lucene.search.ConstantScoreQuery;
45import org.apache.lucene.search.Filter;
46import org.apache.lucene.search.IndexSearcher;
47import org.apache.lucene.search.MultiTermQuery;
48import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
49import org.apache.lucene.search.Query;
50import org.apache.lucene.search.TermRangeFilter;
51import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
52import org.apache.lucene.search.ScoreDoc;
53import org.apache.lucene.search.Sort;
54import org.apache.lucene.search.SortField;
55import org.apache.lucene.search.TopFieldDocs;
56
57import org.apache.lucene.index.DocsEnum;
58import org.apache.lucene.index.MultiFields;
59
60import org.apache.lucene.store.Directory;
61import org.apache.lucene.store.FSDirectory;
62
63import org.apache.lucene.util.Bits;
64import org.apache.lucene.util.BytesRef;
65import org.apache.lucene.util.Version;
66
67public class GS2LuceneQuery extends SharedSoleneQuery
68{
69 public static String SORT_RANK = "rank";
70 public static String SORT_NATURAL = "natural";
71
72 protected String full_indexdir="";
73
74 protected SortField.Type sort_type = SortField.Type.SCORE;
75 protected boolean reverse_sort = false;
76 protected Sort sorter=new Sort();
77 protected Filter filter = null;
78
79 protected QueryParser query_parser = null;
80 protected QueryParser query_parser_no_stop_words = null;
81 protected IndexSearcher searcher = null;
82 protected IndexReader reader = null;
83
84 public GS2LuceneQuery() {
85 super();
86
87 // Create one query parser with the standard set of stop words, and one with none
88
89 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
90 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
91 }
92
93
94 public boolean initialise() {
95
96 if (!super.initialise()) {
97 return false;
98 }
99
100
101 if (full_indexdir==null || full_indexdir.length()==-1){
102 utf8out.println("Index directory is not indicated ");
103 utf8out.flush();
104 return false;
105 }
106
107 try {
108 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
109
110 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
111 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
112
113 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
114 }
115 catch (IOException exception) {
116 exception.printStackTrace();
117 return false;
118 }
119 return true;
120
121 }
122
    /** Sets the filesystem path of the Lucene index directory to search. */
    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }
126
    /**
     * Selects the result ordering: SORT_RANK sorts by relevance score,
     * SORT_NATURAL by Lucene document order, and any other value sorts on that
     * field's string value. The setting takes effect when initialise() next
     * builds the sorter. Note: a null argument throws NullPointerException.
     */
    public void setSortField(String sort_field) {
        if (sort_field.equals(SORT_RANK)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.SCORE;
        } else if (sort_field.equals(SORT_NATURAL)) {
            this.sort_field = null;
            this.sort_type = SortField.Type.DOC;
        } else {
            this.sort_field = sort_field;
            this.sort_type = SortField.Type.STRING; // for now. numeric??
        }
    }
    /** Enables or disables reversal of the chosen sort order (applied when initialise() builds the sorter). */
    public void setReverseSort(boolean reverse) {
        this.reverse_sort = reverse;
    }
    /** @return whether the sort order is currently set to be reversed. */
    public boolean getReverseSort() {
        return this.reverse_sort;
    }
145
    /**
     * Records the raw filter string (via the superclass) and parses it into a
     * Lucene range Filter; if parsing fails, the filter is left null.
     */
    public void setFilterString(String filter_string) {
        super.setFilterString(filter_string);
        this.filter = parseFilterString(filter_string);
    }

    /** @return the parsed range Filter, or null if none was set or parsing failed. */
    public Filter getFilter() {
        return this.filter;
    }
154
155
    /**
     * Runs the given query string against the open index and returns a
     * LuceneQueryResult holding the matching documents, the expanded query
     * terms with their document/total frequencies, and the stop words the
     * parser removed. Returns null if the query string is null/empty; other
     * failures are reported via the error code set on the returned result.
     */
    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result=new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            // Also parse with the no-stop-words parser, purely so we can later
            // report which stop words the normal parser dropped (see below).
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            System.err.println("********* query_string " + query_string + "****");

            // Parse the real query (handles fuzziness), then recursively rewrite
            // it so wildcard/prefix (MultiTermQuery) clauses - including ones
            // nested inside BooleanQuerys - are expanded into concrete terms.
            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = recursiveRewriteQuery(query, reader);
            System.err.println("@@@@ final query class name: " + query.getClass());

            // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
            // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
            // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
            // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
            // http://lucene.apache.org/core/4_7_2/MIGRATE.html

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            // Per-document term-frequency totals, keyed by Lucene doc number.
            HashMap doc_term_freq_map = new HashMap();

            Iterator iter = terms.iterator();

            Bits liveDocs = null;
            if(reader.hasDeletions()) {
                System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
                liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
            }

            // For each expanded term, walk its postings to accumulate the
            // total frequency, the number of matching docs, and per-doc counts.
            while (iter.hasNext()) {

                // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0

                Term term = (Term) iter.next();
                System.err.println("@@@ GS2LuceneQuery.java: Next term: " + term.text());
                BytesRef term_bytes = term.bytes();
                DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?

                // Get the term frequency over all the documents
                //TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;

                if(term_docs != null) {
                    int docID = -1;
                    while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
                        if (term_docs.freq() != 0)
                        {
                            term_freq += term_docs.freq();
                            match_docs++;

                            // Calculate the document-level term frequency as well
                            Integer lucene_doc_num_obj = new Integer(term_docs.docID());
                            int doc_term_freq = 0;
                            if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
                            {
                                doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
                            }
                            doc_term_freq += term_docs.freq();

                            doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
                        }
                    }
                } else {
                    System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query: any term
            // present in the no-stop-words parse but absent from the real parse.
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
            // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
            // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html

            // 1. Figure out how many results there will be.
            //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
            //searcher.search(query, filter, collector);
            //int hitCount = collector.count;

            // Actually do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // Is there a slight difference in the definition between
                // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
                // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
                // Seems to be okay.
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring

                lucene_query_result.setTotalDocs(hits.totalHits);

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.totalHits); // ??

                for (int i = start_results; i < hits.totalHits; i++) {
                    int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
                // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
                    int lucene_doc_num = hits.scoreDocs[i].doc;
                    Document doc = reader.document(lucene_doc_num);
                    int doc_term_freq = 0;
                    Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
                    if (doc_term_freq_object != null)
                    {
                        doc_term_freq = doc_term_freq_object.intValue();
                    }
                    lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }
331
332 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
333 super.setDefaultConjunctionOperator(default_conjunction_operator);
334
335 if (default_conjunction_operator.equals("AND")) {
336 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
337 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
338 } else { // default is OR
339 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
340 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
341 }
342 }
343
344
345 public void cleanUp() {
346 super.cleanUp();
347 try {
348 if(reader != null) {
349 reader.close();
350 // Closes files associated with this index. Also saves any new deletions to disk.
351 // No other methods should be called after this has been called.
352 }
353 } catch (IOException exception) {
354 exception.printStackTrace();
355 }
356 }
357
358
    /**
     * Parses the raw query string into a Lucene Query. The first
     * parenthesised "+(...)" group is taken to hold the search terms proper;
     * any remainder (filter terms) is carried through as an unparsed suffix.
     * When fuzziness is non-null, a "~fuzziness" operator is appended to each
     * plain TX:&lt;term&gt; in the rewritten query (phrase searches are left
     * alone, as fuzzy matching does not apply to them) and the amended string
     * is re-parsed.
     *
     * @param fuzziness fuzzy-match factor as a string, or null for exact matching
     * @return the parsed (and possibly fuzzified) query
     * @throws IOException    if the rewrite against the reader fails
     * @throws ParseException if the query string cannot be parsed
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        // up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            // NOTE(review): the states track successive characters of
            // TEXTFIELD then ':'; this assumes TEXTFIELD is exactly two
            // characters long (e.g. "TX") - confirm.
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while(o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term... add the
                    // fuzzy search indicator
                    // Nor outside the scope of parentheses
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }
460
461 // If you're dealing with a BooleanQuery, they need to be recursively rewritten
462 // as they can contain queries with wildcards (WildcardQuery|PrefixQuery subclasses of MultiTermQuery)
463 // e.g. season* farm
464 // If MultiTermQuery, then expand here. e.g. WildcardQuerys like season*.
465 // DON'T call this method from inside parseQuery() (in place of its query.rewrite()), because then wildcard
466 // queries like season* won't contain Terms (extractTerms() will be empty) since the ConstantScoreQuerys
467 // that a WildcardQuery gets rewritten to here will contain Filters in place of Terms.
468 // Call this method from runQuery() after it calls parseQuery().
469 // Now searches like these will work
470 // season* farm
471 // season* farm*
472 // and not just searches like the following which already used to work:
473 // season*
474 // snail farm
475 // Idea for this method came from inspecting source code to BooleanQuery
476 // https://github.com/apache/lucene-solr/blob/master/lucene/core/src/java/org/apache/lucene/search/BooleanQuery.java
477 // which also does a recursive rewrite. Unfortunately, the existing BooleanQuery does not handle MultiTermQuery
478 // subcomponents.
    /**
     * Recursively rewrites a query so that MultiTermQuery components
     * (WildcardQuery/PrefixQuery, e.g. "season*"), including those nested as
     * clauses inside a BooleanQuery, get expanded into their concrete terms.
     * When a rewrite overflows with TooManyClauses, progressively more
     * conservative rewrite methods are tried (see the numbered comments
     * below). Call this from runQuery() AFTER parseQuery(), never inside it
     * (see the explanation in the comment block above this method).
     *
     * @return the fully rewritten query; recursion stops once a rewrite pass
     *         returns the query unchanged.
     */
    protected Query recursiveRewriteQuery(Query orig_query, IndexReader reader) throws java.io.IOException
    {
        //Query query = orig_query.rewrite(reader);
        Query query = orig_query;

        // First recurse into any BooleanQuery, rewriting each clause in place
        // so embedded wildcard queries are also expanded.
        if(orig_query instanceof BooleanQuery) {
            BooleanQuery booleanQuery = (BooleanQuery)orig_query;
            List<BooleanClause> clauses = booleanQuery.clauses();
            for (BooleanClause clause : clauses) {
                Query subQuery = clause.getQuery();
                subQuery = recursiveRewriteQuery(subQuery, reader);
                clause.setQuery(subQuery);
            }
        }

        // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
        // This change in lucene core library for GS3 (present since after version 2.4.1) had the
        // side-effect that searching on "econom*" didn't display what terms it was searching for,
        // whereas it had done so in GS2.

        // The details of this problem and its current solution are explained in the ticket
        // http://trac.greenstone.org/ticket/845

        // We need to change the settings for the rewriteMethod in order to get searches on wildcards
        // to produce search terms again when the query gets rewritten.

        // We try, in order:
        // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
        // it will expand wildcard searches to its terms when searching at both section AND doc level.
        // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
        // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
        // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
        // 3. Then try the default apache rewriteMethod with its optimum defaults of
        // termCountCutoff=350 and docCountPercent cutoff=0.1%
        // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html

        System.err.println("@@@@ query class name: " + orig_query.getClass());
        System.err.println("@@@@ QUERY: " + orig_query);

        if(orig_query instanceof MultiTermQuery) {
            MultiTermQuery multiTermQuery = (MultiTermQuery)orig_query;
            multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
            // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
        }

        try {
            query = orig_query.rewrite(reader);
        }
        catch(BooleanQuery.TooManyClauses clauseException) {
            // Example test case: try searching the lucene demo collection for "a*"
            // and you'll hit this exception

            //lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);

            if(query instanceof MultiTermQuery) {

                // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
                // This will at least expand the query to its terms when searching with wildcards at section-level
                // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).

                MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
                customRewriteMethod.setDocCountPercent(100.0);
                customRewriteMethod.setTermCountCutoff(350); // same as default

                MultiTermQuery multiTermQuery = (MultiTermQuery)query;
                multiTermQuery.setRewriteMethod(customRewriteMethod);
                try {
                    query = query.rewrite(reader);
                }
                catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {

                    // do what the code originally did: use the default rewriteMethod which
                    // uses a default docCountPercent=0.1 (%) and termCountCutoff=350

                    multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
                    query = query.rewrite(reader);
                }
            }
        }

        // If the rewrite changed anything, keep rewriting until it stabilises.
        if(orig_query == query) {
            return query;
        } else {
            return recursiveRewriteQuery(query, reader);
        }
    }
565
566 protected Filter parseFilterString(String filter_string)
567 {
568 Filter result = null;
569 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
570 Matcher matcher = pattern.matcher(filter_string);
571 if (matcher.matches()) {
572 String field_name = matcher.group(1);
573 boolean include_lower = matcher.group(2).equals("[");
574 BytesRef lower_term = new BytesRef(matcher.group(3));
575 BytesRef upper_term = new BytesRef(matcher.group(4));
576 boolean include_upper = matcher.group(5).equals("]");
577 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
578 }
579 else {
580 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
581 }
582 return result;
583 }
584
585
586 /** command line program and auxiliary methods */
587
588 // Fairly self-explanatory I should hope
589 static protected boolean query_result_caching_enabled = false;
590
591
592 static public void main (String args[])
593 {
594 if (args.length == 0) {
595 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
596 return;
597 }
598
599 try {
600 String index_directory = args[0];
601
602 GS2LuceneQuery queryer = new GS2LuceneQuery();
603 queryer.setIndexDir(index_directory);
604
605 // Prepare the index cache directory, if query result caching is enabled
606 if (query_result_caching_enabled) {
607 // Make the index cache directory if it doesn't already exist
608 File index_cache_directory = new File(index_directory, "cache");
609 if (!index_cache_directory.exists()) {
610 index_cache_directory.mkdir();
611 }
612
613 // Disable caching if the index cache directory isn't available
614 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
615 query_result_caching_enabled = false;
616 }
617 }
618
619 String query_string = null;
620
621 // Parse the command-line arguments
622 for (int i = 1; i < args.length; i++) {
623 if (args[i].equals("-sort")) {
624 i++;
625 queryer.setSortField(args[i]);
626 }
627 else if (args[i].equals("-reverse_sort")) {
628 queryer.setReverseSort(true);
629 }
630 else if (args[i].equals("-filter")) {
631 i++;
632 queryer.setFilterString(args[i]);
633 }
634 else if (args[i].equals("-dco")) {
635 i++;
636 queryer.setDefaultConjunctionOperator(args[i]);
637 }
638 else if (args[i].equals("-fuzziness")) {
639 i++;
640 queryer.setFuzziness(args[i]);
641 }
642 else if (args[i].equals("-startresults")) {
643 i++;
644 if (args[i].matches("\\d+")) {
645 queryer.setStartResults(Integer.parseInt(args[i]));
646 }
647 }
648 else if (args[i].equals("-endresults")) {
649 i++;
650 if (args[i].matches("\\d+")) {
651 queryer.setEndResults(Integer.parseInt(args[i]));
652 }
653 }
654 else {
655 query_string = args[i];
656 }
657 }
658
659 if (!queryer.initialise()) {
660 return;
661 }
662
663 // The query string has been specified as a command-line argument
664 if (query_string != null) {
665 runQueryCaching(index_directory, queryer, query_string);
666 }
667
668 // Read queries from STDIN
669 else {
670 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
671 while (true) {
672 // Read the query from STDIN
673 query_string = in.readLine();
674 if (query_string == null || query_string.length() == -1) {
675 break;
676 }
677
678 runQueryCaching(index_directory, queryer, query_string);
679
680 }
681 }
682 queryer.cleanUp();
683 }
684 catch (IOException exception) {
685 exception.printStackTrace();
686 }
687 }
688
689 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
690 throws IOException
691 {
692 StringBuffer query_results_xml = new StringBuffer();
693
694 // Check if this query result has been cached from a previous search (if it's enabled)
695 File query_result_cache_file = null;
696 if (query_result_caching_enabled) {
697 // Generate the cache file name from the query options
698 String query_result_cache_file_name = query_string + "-";
699 String fuzziness = queryer.getFuzziness();
700 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
701 String filter_string = queryer.getFilterString();
702 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
703 String sort_string = queryer.getSortField();
704 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
705 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
706 query_result_cache_file_name += reverse_sort_string + "-";
707 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
708 query_result_cache_file_name += default_conjunction_operator + "-";
709 int start_results = queryer.getStartResults();
710 int end_results = queryer.getEndResults();
711 query_result_cache_file_name += start_results + "-" + end_results;
712 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
713
714 // If the query result cache file exists, just return its contents and we're done
715 File index_cache_directory = new File(index_directory, "cache");
716 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
717 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
718 FileInputStream fis = new FileInputStream(query_result_cache_file);
719 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
720 BufferedReader buffered_reader = new BufferedReader(isr);
721 String line = "";
722 while ((line = buffered_reader.readLine()) != null) {
723 query_results_xml.append(line + "\n");
724 }
725 String query_results_xml_string = query_results_xml.toString();
726 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
727
728 utf8out.print(query_results_xml_string);
729 utf8out.flush();
730
731 return;
732 }
733 }
734
735 // not cached
736 query_results_xml.append("<ResultSet cached=\"false\">\n");
737 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
738 Filter filter = queryer.getFilter();
739 if (filter != null) {
740 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
741 }
742
743 LuceneQueryResult query_result = queryer.runQuery(query_string);
744 if (query_result == null) {
745 System.err.println("Couldn't run the query");
746 return;
747 }
748
749 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
750 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
751 } else {
752 query_results_xml.append(query_result.getXMLString());
753 }
754 query_results_xml.append("</ResultSet>\n");
755
756 utf8out.print(query_results_xml);
757 utf8out.flush();
758
759 // Cache this query result, if desired
760 if (query_result_caching_enabled) {
761 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
762 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
763 // files, it will just affect the speed of subsequent requests.
764 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
765 // can get very long in some collections)
766 try
767 {
768 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
769 query_result_cache_file_writer.write(query_results_xml.toString());
770 query_result_cache_file_writer.close();
771 }
772 catch (Exception exception)
773 {
774 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
775 }
776 }
777 }
778
779 protected static String fileSafe(String text)
780 {
781 StringBuffer file_safe_text = new StringBuffer();
782 for (int i = 0; i < text.length(); i++) {
783 char character = text.charAt(i);
784 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
785 file_safe_text.append(character);
786 }
787 else {
788 file_safe_text.append('%');
789 file_safe_text.append((int) character);
790 }
791 }
792 return file_safe_text.toString();
793 }
794
795
796}
797
798
Note: See TracBrowser for help on using the repository browser.