source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 29167

Last change on this file since 29167 was 29167, checked in by ak19, 10 years ago

Fix for a NullPointerException thrown when a query term matches no term docs (e.g. the search term "bla"). Also adds some comments on how to get all the matching docs in Lucene 4.7.2.

File size: 27.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper4;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.DirectoryReader;
37import org.apache.lucene.index.IndexReader;
38import org.apache.lucene.index.Term;
39//import org.apache.lucene.index.TermDocs;
40import org.apache.lucene.queryparser.classic.ParseException;
41import org.apache.lucene.queryparser.classic.QueryParser;
42import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
43import org.apache.lucene.search.Filter;
44import org.apache.lucene.search.IndexSearcher;
45import org.apache.lucene.search.MultiTermQuery;
46import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
47import org.apache.lucene.search.Query;
48import org.apache.lucene.search.TermRangeFilter;
49import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
50import org.apache.lucene.search.ScoreDoc;
51import org.apache.lucene.search.Sort;
52import org.apache.lucene.search.SortField;
53import org.apache.lucene.search.TopFieldDocs;
54
55import org.apache.lucene.index.DocsEnum;
56import org.apache.lucene.index.MultiFields;
57
58import org.apache.lucene.store.Directory;
59import org.apache.lucene.store.FSDirectory;
60
61import org.apache.lucene.util.Bits;
62import org.apache.lucene.util.BytesRef;
63import org.apache.lucene.util.Version;
64
65public class GS2LuceneQuery extends SharedSoleneQuery
66{
    // Sort-mode names accepted by setSortField()
    public static String SORT_RANK = "rank";
    public static String SORT_NATURAL = "natural";

    // Full filesystem path of the Lucene index directory (set via setIndexDir)
    protected String full_indexdir="";

    // Sort configuration: sort_type pairs with sort_field (inherited from the
    // superclass); sorter is rebuilt from both in initialise()
    protected SortField.Type sort_type = SortField.Type.SCORE;
    protected boolean reverse_sort = false;
    protected Sort sorter=new Sort();
    // Parsed form of the filter string, built by setFilterString(); may be null
    protected Filter filter = null;

    // Two parsers: one using the analyzer's built-in stop words, one with none
    protected QueryParser query_parser = null;
    protected QueryParser query_parser_no_stop_words = null;
    // Opened against full_indexdir in initialise(), closed in cleanUp()
    protected IndexSearcher searcher = null;
    protected IndexReader reader = null;
82 public GS2LuceneQuery() {
83 super();
84
85 // Create one query parser with the standard set of stop words, and one with none
86
87 query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
88 query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
89 }
90
91
92 public boolean initialise() {
93
94 if (!super.initialise()) {
95 return false;
96 }
97
98
99 if (full_indexdir==null || full_indexdir.length()==-1){
100 utf8out.println("Index directory is not indicated ");
101 utf8out.flush();
102 return false;
103 }
104
105 try {
106 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
107
108 reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
109 searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
110
111 this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
    /** Records the full path of the Lucene index directory to be opened by initialise(). */
    public void setIndexDir(String full_indexdir) {
	this.full_indexdir = full_indexdir;
    }
124
125 public void setSortField(String sort_field) {
126 if (sort_field.equals(SORT_RANK)) {
127 this.sort_field = null;
128 this.sort_type = SortField.Type.SCORE;
129 } else if (sort_field.equals(SORT_NATURAL)) {
130 this.sort_field = null;
131 this.sort_type = SortField.Type.DOC;
132 } else {
133 this.sort_field = sort_field;
134 this.sort_type = SortField.Type.STRING; // for now. numeric??
135 }
136 }
    /** Sets whether results should be returned in reverse sort order (used when building the sorter in initialise()). */
    public void setReverseSort(boolean reverse) {
	this.reverse_sort = reverse;
    }
    /** @return true if results are to be sorted in reverse order */
    public boolean getReverseSort() {
	return this.reverse_sort;
    }
143
    /**
     * Stores the raw filter string in the superclass and parses it into a
     * Lucene Filter for use by runQuery().
     */
    public void setFilterString(String filter_string) {
	super.setFilterString(filter_string);
	this.filter = parseFilterString(filter_string);
    }

    /** @return the parsed Filter, or null if none was set or parsing failed */
    public Filter getFilter() {
	return this.filter;
    }
152
153
154 public LuceneQueryResult runQuery(String query_string) {
155
156 if (query_string == null || query_string.equals("")) {
157 utf8out.println("The query word is not indicated ");
158 utf8out.flush();
159 return null;
160 }
161
162 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
163 lucene_query_result.clear();
164
165 try {
166 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
167 query_including_stop_words = query_including_stop_words.rewrite(reader);
168
169 // System.err.println("********* query_string " + query_string + "****");
170
171 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
172
173 // GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
174 // This change in lucene core library for GS3 (present since after version 2.4.1) had the
175 // side-effect that searching on "econom*" didn't display what terms it was searching for,
176 // whereas it had done so in GS2.
177
178 // The details of this problem and its current solution are explained in the ticket
179 // http://trac.greenstone.org/ticket/845
180
181 // We need to change the settings for the rewriteMethod in order to get searches on wildcards
182 // to produce search terms again when the query gets rewritten.
183
184 // We try, in order:
185 // 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
186 // it will expand wildcard searches to its terms when searching at both section AND doc level.
187 // If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
188 // 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
189 // If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
190 // 3. Then try the default apache rewriteMethod with its optimum defaults of
191 // termCountCutoff=350 and docCountPercent cutoff=0.1%
192 // See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
193
194 if(query instanceof MultiTermQuery) {
195 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
196 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
197 // less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
198 }
199
200 try {
201 query = query.rewrite(reader);
202 }
203 catch(BooleanQuery.TooManyClauses clauseException) {
204 // Example test case: try searching the lucene demo collection for "a*"
205 // and you'll hit this exception
206
207 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
208
209 if(query instanceof MultiTermQuery) {
210
211 // CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
212 // This will at least expand the query to its terms when searching with wildcards at section-level
213 // (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
214
215 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
216 customRewriteMethod.setDocCountPercent(100.0);
217 customRewriteMethod.setTermCountCutoff(350); // same as default
218
219 MultiTermQuery multiTermQuery = (MultiTermQuery)query;
220 multiTermQuery.setRewriteMethod(customRewriteMethod);
221 try {
222 query = query.rewrite(reader);
223 }
224 catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
225
226 // do what the code originally did: use the default rewriteMethod which
227 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
228
229 multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
230 query = query.rewrite(reader);
231 }
232 }
233 }
234
235 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
236 // http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
237 // http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
238 // https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
239 // http://lucene.apache.org/core/4_7_2/MIGRATE.html
240
241 // Get the list of expanded query terms and their frequencies
242 // num docs matching, and total frequency
243 HashSet terms = new HashSet();
244 query.extractTerms(terms);
245
246 HashMap doc_term_freq_map = new HashMap();
247
248 Iterator iter = terms.iterator();
249
250 Bits liveDocs = null;
251 if(reader.hasDeletions()) {
252 System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
253 liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
254 }
255
256 while (iter.hasNext()) {
257
258 // http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
259
260 Term term = (Term) iter.next();
261 BytesRef term_bytes = term.bytes();
262 DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?
263
264 // Get the term frequency over all the documents
265 //TermDocs term_docs = reader.termDocs(term);
266 int term_freq = 0;
267 int match_docs = 0;
268
269 if(term_docs != null) {
270 int docID = -1;
271 while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
272 if (term_docs.freq() != 0)
273 {
274 term_freq += term_docs.freq();
275 match_docs++;
276
277 // Calculate the document-level term frequency as well
278 Integer lucene_doc_num_obj = new Integer(term_docs.docID());
279 int doc_term_freq = 0;
280 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
281 {
282 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
283 }
284 doc_term_freq += term_docs.freq();
285
286 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
287 }
288 }
289 } else {
290 System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
291 }
292
293 // Create a term
294 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
295 }
296
297 // Get the list of stop words removed from the query
298 HashSet terms_including_stop_words = new HashSet();
299 query_including_stop_words.extractTerms(terms_including_stop_words);
300 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
301 while (terms_including_stop_words_iter.hasNext()) {
302 Term term = (Term) terms_including_stop_words_iter.next();
303 if (!terms.contains(term)) {
304 lucene_query_result.addStopWord(term.text());
305 }
306 }
307
308 // Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
309 // http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
310 // http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html
311
312 // 1. Figure out how many results there will be.
313 //TotalHitCountCollecter countCollector = new TotalHitCountCollector();
314 //searcher.search(query, filter, collector);
315 //int hitCount = collector.count;
316
317 // Actually do the query
318 // Simple case for getting all the matching documents
319 if (end_results == Integer.MAX_VALUE) {
320 // Perform the query (filter and sorter may be null)
321 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
322 // Is there a slight difference in the definition between
323 // https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
324 // and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
325 // Seems to be okay.
326 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
327
328 lucene_query_result.setTotalDocs(hits.totalHits);
329
330 // Output the matching documents
331 lucene_query_result.setStartResults(start_results);
332 lucene_query_result.setEndResults(hits.totalHits);
333
334 for (int i = start_results; i <= hits.totalHits; i++) {
335 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
336 Document doc = reader.document(lucene_doc_num);
337 int doc_term_freq = 0;
338 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
339 if (doc_term_freq_object != null)
340 {
341 doc_term_freq = doc_term_freq_object.intValue();
342 }
343 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
344 }
345 }
346
347 // Slightly more complicated case for returning a subset of the matching documents
348 else {
349 // Perform the query (filter may be null)
350 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
351 // See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
352 lucene_query_result.setTotalDocs(hits.totalHits);
353
354 lucene_query_result.setStartResults(start_results);
355 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
356
357 // Output the matching documents
358 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
359 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
360 Document doc = reader.document(lucene_doc_num);
361 int doc_term_freq = 0;
362 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
363 if (doc_term_freq_object != null)
364 {
365 doc_term_freq = doc_term_freq_object.intValue();
366 }
367 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
368 }
369 }
370 }
371
372 catch (ParseException parse_exception) {
373 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
374 }
375 catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
376 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
377 }
378 catch (IOException exception) {
379 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
380 exception.printStackTrace();
381 }
382 catch (Exception exception) {
383 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
384 exception.printStackTrace();
385 }
386 return lucene_query_result;
387 }
388
389 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
390 super.setDefaultConjunctionOperator(default_conjunction_operator);
391
392 if (default_conjunction_operator.equals("AND")) {
393 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
394 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
395 } else { // default is OR
396 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
397 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
398 }
399 }
400
401
402 public void cleanUp() {
403 super.cleanUp();
404 try {
405 if(reader != null) {
406 reader.close();
407 // Closes files associated with this index. Also saves any new deletions to disk.
408 // No other methods should be called after this has been called.
409 }
410 } catch (IOException exception) {
411 exception.printStackTrace();
412 }
413 }
414
415
    /**
     * Parses query_string into a Lucene Query, optionally appending the fuzzy
     * operator (~fuzziness) to each plain TX:&lt;term&gt; in the rewritten query.
     *
     * The string is first split into a prefix — the initial parenthesised
     * group, assumed to hold the search terms — and a suffix (any trailing
     * filter terms). Only the prefix is rewritten for fuzziness; phrase
     * queries (TX:"...") are skipped since fuzzy phrases are not supported.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser the parser (with or without stop words) to use
     * @param query_string the raw query text
     * @param fuzziness    fuzzy factor to append to each term, or null for none
     * @return the parsed (and possibly fuzzified) query
     * @throws java.io.IOException if rewriting against the reader fails
     * @throws org.apache.lucene.queryparser.classic.ParseException if the
     *         query text cannot be parsed
     */
    protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the prefix so wildcard terms are expanded before
	// the fuzzy operator is attached to them
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    // NOTE(review): the T/X checks below compare against the first two
	    // characters of TEXTFIELD — assumes TEXTFIELD is two characters
	    // followed by ':' in the rewritten query (e.g. "TX:"); confirm if
	    // TEXTFIELD ever changes.
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    int s = 0; // State
	    while (o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness concatenates to e.g. "~0.7"; skip past it
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    // Not fuzzy: parse the whole original string in one go
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
517
518 protected Filter parseFilterString(String filter_string)
519 {
520 Filter result = null;
521 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
522 Matcher matcher = pattern.matcher(filter_string);
523 if (matcher.matches()) {
524 String field_name = matcher.group(1);
525 boolean include_lower = matcher.group(2).equals("[");
526 BytesRef lower_term = new BytesRef(matcher.group(3));
527 BytesRef upper_term = new BytesRef(matcher.group(4));
528 boolean include_upper = matcher.group(5).equals("]");
529 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
530 }
531 else {
532 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
533 }
534 return result;
535 }
536
537
    /** command line program and auxiliary methods */

    // When true, query result XML is cached on disk under <index>/cache and
    // reused for identical queries; main() switches it off again if the cache
    // directory cannot be created. Off by default.
    static protected boolean query_result_caching_enabled = false;
542
543
544 static public void main (String args[])
545 {
546 if (args.length == 0) {
547 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND|OR] [-startresults number -endresults number] [query]");
548 return;
549 }
550
551 try {
552 String index_directory = args[0];
553
554 GS2LuceneQuery queryer = new GS2LuceneQuery();
555 queryer.setIndexDir(index_directory);
556
557 // Prepare the index cache directory, if query result caching is enabled
558 if (query_result_caching_enabled) {
559 // Make the index cache directory if it doesn't already exist
560 File index_cache_directory = new File(index_directory, "cache");
561 if (!index_cache_directory.exists()) {
562 index_cache_directory.mkdir();
563 }
564
565 // Disable caching if the index cache directory isn't available
566 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
567 query_result_caching_enabled = false;
568 }
569 }
570
571 String query_string = null;
572
573 // Parse the command-line arguments
574 for (int i = 1; i < args.length; i++) {
575 if (args[i].equals("-sort")) {
576 i++;
577 queryer.setSortField(args[i]);
578 }
579 else if (args[i].equals("-reverse_sort")) {
580 queryer.setReverseSort(true);
581 }
582 else if (args[i].equals("-filter")) {
583 i++;
584 queryer.setFilterString(args[i]);
585 }
586 else if (args[i].equals("-dco")) {
587 i++;
588 queryer.setDefaultConjunctionOperator(args[i]);
589 }
590 else if (args[i].equals("-fuzziness")) {
591 i++;
592 queryer.setFuzziness(args[i]);
593 }
594 else if (args[i].equals("-startresults")) {
595 i++;
596 if (args[i].matches("\\d+")) {
597 queryer.setStartResults(Integer.parseInt(args[i]));
598 }
599 }
600 else if (args[i].equals("-endresults")) {
601 i++;
602 if (args[i].matches("\\d+")) {
603 queryer.setEndResults(Integer.parseInt(args[i]));
604 }
605 }
606 else {
607 query_string = args[i];
608 }
609 }
610
611 if (!queryer.initialise()) {
612 return;
613 }
614
615 // The query string has been specified as a command-line argument
616 if (query_string != null) {
617 runQueryCaching(index_directory, queryer, query_string);
618 }
619
620 // Read queries from STDIN
621 else {
622 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
623 while (true) {
624 // Read the query from STDIN
625 query_string = in.readLine();
626 if (query_string == null || query_string.length() == -1) {
627 break;
628 }
629
630 runQueryCaching(index_directory, queryer, query_string);
631
632 }
633 }
634 queryer.cleanUp();
635 }
636 catch (IOException exception) {
637 exception.printStackTrace();
638 }
639 }
640
641 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
642 throws IOException
643 {
644 StringBuffer query_results_xml = new StringBuffer();
645
646 // Check if this query result has been cached from a previous search (if it's enabled)
647 File query_result_cache_file = null;
648 if (query_result_caching_enabled) {
649 // Generate the cache file name from the query options
650 String query_result_cache_file_name = query_string + "-";
651 String fuzziness = queryer.getFuzziness();
652 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
653 String filter_string = queryer.getFilterString();
654 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
655 String sort_string = queryer.getSortField();
656 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
657 String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
658 query_result_cache_file_name += reverse_sort_string + "-";
659 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
660 query_result_cache_file_name += default_conjunction_operator + "-";
661 int start_results = queryer.getStartResults();
662 int end_results = queryer.getEndResults();
663 query_result_cache_file_name += start_results + "-" + end_results;
664 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
665
666 // If the query result cache file exists, just return its contents and we're done
667 File index_cache_directory = new File(index_directory, "cache");
668 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
669 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
670 FileInputStream fis = new FileInputStream(query_result_cache_file);
671 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
672 BufferedReader buffered_reader = new BufferedReader(isr);
673 String line = "";
674 while ((line = buffered_reader.readLine()) != null) {
675 query_results_xml.append(line + "\n");
676 }
677 String query_results_xml_string = query_results_xml.toString();
678 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
679
680 utf8out.print(query_results_xml_string);
681 utf8out.flush();
682
683 return;
684 }
685 }
686
687 // not cached
688 query_results_xml.append("<ResultSet cached=\"false\">\n");
689 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
690 Filter filter = queryer.getFilter();
691 if (filter != null) {
692 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
693 }
694
695 LuceneQueryResult query_result = queryer.runQuery(query_string);
696 if (query_result == null) {
697 System.err.println("Couldn't run the query");
698 return;
699 }
700
701 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
702 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
703 } else {
704 query_results_xml.append(query_result.getXMLString());
705 }
706 query_results_xml.append("</ResultSet>\n");
707
708 utf8out.print(query_results_xml);
709 utf8out.flush();
710
711 // Cache this query result, if desired
712 if (query_result_caching_enabled) {
713 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
714 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
715 // files, it will just affect the speed of subsequent requests.
716 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
717 // can get very long in some collections)
718 try
719 {
720 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
721 query_result_cache_file_writer.write(query_results_xml.toString());
722 query_result_cache_file_writer.close();
723 }
724 catch (Exception exception)
725 {
726 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
727 }
728 }
729 }
730
731 protected static String fileSafe(String text)
732 {
733 StringBuffer file_safe_text = new StringBuffer();
734 for (int i = 0; i < text.length(); i++) {
735 char character = text.charAt(i);
736 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
737 file_safe_text.append(character);
738 }
739 else {
740 file_safe_text.append('%');
741 file_safe_text.append((int) character);
742 }
743 }
744 return file_safe_text.toString();
745 }
746
747
748}
749
750
Note: See TracBrowser for help on using the repository browser.