Context Navigation

source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 30159

Last change on this file since 30159 was 30159, checked in by kjdon, 9 years ago
making all search results indexes start from 0 not 1.
File size: 27.5 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneQuery.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.LuceneWrapper4;
27
28
29	import java.io.*;
30	import java.util.*;
31	import java.util.regex.*;
32
33	import org.apache.lucene.analysis.Analyzer;
34	import org.apache.lucene.analysis.standard.StandardAnalyzer;
35	import org.apache.lucene.document.Document;
36	import org.apache.lucene.index.DirectoryReader;
37	import org.apache.lucene.index.IndexReader;
38	import org.apache.lucene.index.Term;
39	//import org.apache.lucene.index.TermDocs;
40	import org.apache.lucene.queryparser.classic.ParseException;
41	import org.apache.lucene.queryparser.classic.QueryParser;
42	import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
43	import org.apache.lucene.search.Filter;
44	import org.apache.lucene.search.IndexSearcher;
45	import org.apache.lucene.search.MultiTermQuery;
46	import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
47	import org.apache.lucene.search.Query;
48	import org.apache.lucene.search.TermRangeFilter;
49	import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
50	import org.apache.lucene.search.ScoreDoc;
51	import org.apache.lucene.search.Sort;
52	import org.apache.lucene.search.SortField;
53	import org.apache.lucene.search.TopFieldDocs;
54
55	import org.apache.lucene.index.DocsEnum;
56	import org.apache.lucene.index.MultiFields;
57
58	import org.apache.lucene.store.Directory;
59	import org.apache.lucene.store.FSDirectory;
60
61	import org.apache.lucene.util.Bits;
62	import org.apache.lucene.util.BytesRef;
63	import org.apache.lucene.util.Version;
64
65	public class GS2LuceneQuery extends SharedSoleneQuery
66	{
67	public static String SORT_RANK = "rank";
68	public static String SORT_NATURAL = "natural";
69
70	protected String full_indexdir="";
71
72	protected SortField.Type sort_type = SortField.Type.SCORE;
73	protected boolean reverse_sort = false;
74	protected Sort sorter=new Sort();
75	protected Filter filter = null;
76
77	protected QueryParser query_parser = null;
78	protected QueryParser query_parser_no_stop_words = null;
79	protected IndexSearcher searcher = null;
80	protected IndexReader reader = null;
81
82	public GS2LuceneQuery() {
83	super();
84
85	// Create one query parser with the standard set of stop words, and one with none
86
87	query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
88	query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
89	}
90
91
92	public boolean initialise() {
93
94	if (!super.initialise()) {
95	return false;
96	}
97
98
99	if (full_indexdir==null \|\| full_indexdir.length()==-1){
100	utf8out.println("Index directory is not indicated ");
101	utf8out.flush();
102	return false;
103	}
104
105	try {
106	Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
107
108	reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
109	searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
110
111	this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
112	}
113	catch (IOException exception) {
114	exception.printStackTrace();
115	return false;
116	}
117	return true;
118
119	}
120
121	public void setIndexDir(String full_indexdir) {
122	this.full_indexdir = full_indexdir;
123	}
124
125	public void setSortField(String sort_field) {
126	if (sort_field.equals(SORT_RANK)) {
127	this.sort_field = null;
128	this.sort_type = SortField.Type.SCORE;
129	} else if (sort_field.equals(SORT_NATURAL)) {
130	this.sort_field = null;
131	this.sort_type = SortField.Type.DOC;
132	} else {
133	this.sort_field = sort_field;
134	this.sort_type = SortField.Type.STRING; // for now. numeric??
135	}
136	}
137	public void setReverseSort(boolean reverse) {
138	this.reverse_sort = reverse;
139	}
140	public boolean getReverseSort() {
141	return this.reverse_sort;
142	}
143
144	public void setFilterString(String filter_string) {
145	super.setFilterString(filter_string);
146	this.filter = parseFilterString(filter_string);
147	}
148
149	public Filter getFilter() {
150	return this.filter;
151	}
152
153
154	public LuceneQueryResult runQuery(String query_string) {
155
156	if (query_string == null \|\| query_string.equals("")) {
157	utf8out.println("The query word is not indicated ");
158	utf8out.flush();
159	return null;
160	}
161
162	LuceneQueryResult lucene_query_result=new LuceneQueryResult();
163	lucene_query_result.clear();
164
165	try {
166	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
167	query_including_stop_words = query_including_stop_words.rewrite(reader);
168
169	// System.err.println("******* query_string " + query_string + "**");
170
171	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
172
173	// GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
174	// This change in lucene core library for GS3 (present since after version 2.4.1) had the
175	// side-effect that searching on "econom*" didn't display what terms it was searching for,
176	// whereas it had done so in GS2.
177
178	// The details of this problem and its current solution are explained in the ticket
179	// http://trac.greenstone.org/ticket/845
180
181	// We need to change the settings for the rewriteMethod in order to get searches on wildcards
182	// to produce search terms again when the query gets rewritten.
183
184	// We try, in order:
185	// 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
186	// it will expand wildcard searches to its terms when searching at both section AND doc level.
187	// If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
188	// 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
189	// If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
190	// 3. Then try the default apache rewriteMethod with its optimum defaults of
191	// termCountCutoff=350 and docCountPercent cutoff=0.1%
192	// See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
193
194	if(query instanceof MultiTermQuery) {
195	MultiTermQuery multiTermQuery = (MultiTermQuery)query;
196	multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
197	// less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
198	}
199
200	try {
201	query = query.rewrite(reader);
202	}
203	catch(BooleanQuery.TooManyClauses clauseException) {
204	// Example test case: try searching the lucene demo collection for "a*"
205	// and you'll hit this exception
206
207	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
208
209	if(query instanceof MultiTermQuery) {
210
211	// CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
212	// This will at least expand the query to its terms when searching with wildcards at section-level
213	// (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
214
215	MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
216	customRewriteMethod.setDocCountPercent(100.0);
217	customRewriteMethod.setTermCountCutoff(350); // same as default
218
219	MultiTermQuery multiTermQuery = (MultiTermQuery)query;
220	multiTermQuery.setRewriteMethod(customRewriteMethod);
221	try {
222	query = query.rewrite(reader);
223	}
224	catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
225
226	// do what the code originally did: use the default rewriteMethod which
227	// uses a default docCountPercent=0.1 (%) and termCountCutoff=350
228
229	multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
230	query = query.rewrite(reader);
231	}
232	}
233	}
234
235	// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
236	// http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
237	// http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
238	// https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
239	// http://lucene.apache.org/core/4_7_2/MIGRATE.html
240
241	// Get the list of expanded query terms and their frequencies
242	// num docs matching, and total frequency
243	HashSet terms = new HashSet();
244	query.extractTerms(terms);
245
246	HashMap doc_term_freq_map = new HashMap();
247
248	Iterator iter = terms.iterator();
249
250	Bits liveDocs = null;
251	if(reader.hasDeletions()) {
252	System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
253	liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
254	}
255
256	while (iter.hasNext()) {
257
258	// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
259
260	Term term = (Term) iter.next();
261	BytesRef term_bytes = term.bytes();
262	DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?
263
264	// Get the term frequency over all the documents
265	//TermDocs term_docs = reader.termDocs(term);
266	int term_freq = 0;
267	int match_docs = 0;
268
269	if(term_docs != null) {
270	int docID = -1;
271	while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
272	if (term_docs.freq() != 0)
273	{
274	term_freq += term_docs.freq();
275	match_docs++;
276
277	// Calculate the document-level term frequency as well
278	Integer lucene_doc_num_obj = new Integer(term_docs.docID());
279	int doc_term_freq = 0;
280	if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
281	{
282	doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
283	}
284	doc_term_freq += term_docs.freq();
285
286	doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
287	}
288	}
289	} else {
290	System.err.println("@@@ GS2LuceneQuery.java: term_docs is null for term " + term.text());
291	}
292
293	// Create a term
294	lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
295	}
296
297	// Get the list of stop words removed from the query
298	HashSet terms_including_stop_words = new HashSet();
299	query_including_stop_words.extractTerms(terms_including_stop_words);
300	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
301	while (terms_including_stop_words_iter.hasNext()) {
302	Term term = (Term) terms_including_stop_words_iter.next();
303	if (!terms.contains(term)) {
304	lucene_query_result.addStopWord(term.text());
305	}
306	}
307
308	// Extracting all documents for a given search - http://www.gossamer-threads.com/lists/lucene/java-user/134873
309	// http://lucene.apache.org/core/3_4_0/api/core/org/apache/lucene/search/TotalHitCountCollector.html
310	// http://lucene.apache.org/core/4_7_2/core/index.html?org/apache/lucene/search/TopFieldDocs.html
311
312	// 1. Figure out how many results there will be.
313	//TotalHitCountCollecter countCollector = new TotalHitCountCollector();
314	//searcher.search(query, filter, collector);
315	//int hitCount = collector.count;
316
317	// Actually do the query
318	// Simple case for getting all the matching documents
319	if (end_results == Integer.MAX_VALUE) {
320	// Perform the query (filter and sorter may be null)
321	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
322	// Is there a slight difference in the definition between
323	// https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
324	// and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
325	// Seems to be okay.
326	// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
327
328	lucene_query_result.setTotalDocs(hits.totalHits);
329
330	// Output the matching documents
331	lucene_query_result.setStartResults(start_results);
332	lucene_query_result.setEndResults(hits.totalHits); // ??
333
334	for (int i = start_results; i < hits.totalHits; i++) {
335	int lucene_doc_num = hits.scoreDocs[i ].doc; // i-1
336	Document doc = reader.document(lucene_doc_num);
337	int doc_term_freq = 0;
338	Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
339	if (doc_term_freq_object != null)
340	{
341	doc_term_freq = doc_term_freq_object.intValue();
342	}
343	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
344	}
345	}
346
347	// Slightly more complicated case for returning a subset of the matching documents
348	else {
349	// Perform the query (filter may be null)
350	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
351	// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
352	lucene_query_result.setTotalDocs(hits.totalHits);
353
354	lucene_query_result.setStartResults(start_results);
355	lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
356
357	// Output the matching documents
358	for (int i = start_results; (i < hits.scoreDocs.length && i < end_results); i++) {
359	int lucene_doc_num = hits.scoreDocs[i].doc;
360	Document doc = reader.document(lucene_doc_num);
361	int doc_term_freq = 0;
362	Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
363	if (doc_term_freq_object != null)
364	{
365	doc_term_freq = doc_term_freq_object.intValue();
366	}
367	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i].score, doc_term_freq);
368	}
369	}
370	}
371
372	catch (ParseException parse_exception) {
373	lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
374	}
375	catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
376	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
377	}
378	catch (IOException exception) {
379	lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
380	exception.printStackTrace();
381	}
382	catch (Exception exception) {
383	lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
384	exception.printStackTrace();
385	}
386	return lucene_query_result;
387	}
388
389	public void setDefaultConjunctionOperator(String default_conjunction_operator) {
390	super.setDefaultConjunctionOperator(default_conjunction_operator);
391
392	if (default_conjunction_operator.equals("AND")) {
393	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
394	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
395	} else { // default is OR
396	query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
397	query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
398	}
399	}
400
401
402	public void cleanUp() {
403	super.cleanUp();
404	try {
405	if(reader != null) {
406	reader.close();
407	// Closes files associated with this index. Also saves any new deletions to disk.
408	// No other methods should be called after this has been called.
409	}
410	} catch (IOException exception) {
411	exception.printStackTrace();
412	}
413	}
414
415
416	protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
417	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
418	{
419	// Split query string into the search terms and the filter terms
420	// * The first +(...) term contains the search terms so count
421	// up '(' and stop when we finish matching ')'
422	int offset = 0;
423	int paren_count = 0;
424	boolean seen_paren = false;
425	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
426	if (query_string.charAt(offset) == '(') {
427	paren_count++;
428	seen_paren = true;
429	}
430	if (query_string.charAt(offset) == ')') {
431	paren_count--;
432	}
433	offset++;
434	}
435	String query_prefix = query_string.substring(0, offset);
436	String query_suffix = query_string.substring(offset);
437
438	///ystem.err.println("Prefix: " + query_prefix);
439	///ystem.err.println("Suffix: " + query_suffix);
440
441	Query query = query_parser.parse(query_prefix);
442	query = query.rewrite(reader);
443
444	// If this is a fuzzy search, then we need to add the fuzzy
445	// flag to each of the query terms
446	if (fuzziness != null && query.toString().length() > 0) {
447
448	// Revert the query to a string
449	System.err.println("Rewritten query: " + query.toString());
450	// Search through the string for TX:<term> query terms
451	// and append the ~ operator. Note that this search will
452	// not change phrase searches (TX:"<term> <term>") as
453	// fuzzy searching is not possible for these entries.
454	// Yahoo! Time for a state machine!
455	StringBuffer mutable_query_string = new StringBuffer(query.toString());
456	int o = 0; // Offset
457	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
458	int s = 0; // State
459	while(o < mutable_query_string.length()) {
460	char c = mutable_query_string.charAt(o);
461	if (s == 0 && c == TEXTFIELD.charAt(0)) {
462	///ystem.err.println("Found T!");
463	s = 1;
464	}
465	else if (s == 1) {
466	if (c == TEXTFIELD.charAt(1)) {
467	///ystem.err.println("Found X!");
468	s = 2;
469	}
470	else {
471	s = 0; // Reset
472	}
473	}
474	else if (s == 2) {
475	if (c == ':') {
476	///ystem.err.println("Found TX:!");
477	s = 3;
478	}
479	else {
480	s = 0; // Reset
481	}
482	}
483	else if (s == 3) {
484	// Don't process phrases
485	if (c == '"') {
486	///ystem.err.println("Stupid phrase...");
487	s = 0; // Reset
488	}
489	// Found the end of the term... add the
490	// fuzzy search indicator
491	// Nor outside the scope of parentheses
492	else if (Character.isWhitespace(c) \|\| c == ')') {
493	///ystem.err.println("Yahoo! Found fuzzy term.");
494	mutable_query_string.insert(o, '~' + fuzziness);
495	o++;
496	s = 0; // Reset
497	}
498	}
499	o++;
500	}
501	// If we were in the state of looking for the end of a
502	// term - then we just found it!
503	if (s == 3) {
504
505	mutable_query_string.append('~' + fuzziness);
506	}
507	// Reparse the query
508	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
509	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
510	}
511	else {
512	query = query_parser.parse(query_prefix + query_suffix);
513	}
514
515	return query;
516	}
517
518	protected Filter parseFilterString(String filter_string)
519	{
520	Filter result = null;
521	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
522	Matcher matcher = pattern.matcher(filter_string);
523	if (matcher.matches()) {
524	String field_name = matcher.group(1);
525	boolean include_lower = matcher.group(2).equals("[");
526	BytesRef lower_term = new BytesRef(matcher.group(3));
527	BytesRef upper_term = new BytesRef(matcher.group(4));
528	boolean include_upper = matcher.group(5).equals("]");
529	result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
530	}
531	else {
532	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
533	}
534	return result;
535	}
536
537
538	/** command line program and auxiliary methods */
539
540	// Fairly self-explanatory I should hope
541	static protected boolean query_result_caching_enabled = false;
542
543
544	static public void main (String args[])
545	{
546	if (args.length == 0) {
547	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND\|OR] [-startresults number -endresults number] [query]");
548	return;
549	}
550
551	try {
552	String index_directory = args[0];
553
554	GS2LuceneQuery queryer = new GS2LuceneQuery();
555	queryer.setIndexDir(index_directory);
556
557	// Prepare the index cache directory, if query result caching is enabled
558	if (query_result_caching_enabled) {
559	// Make the index cache directory if it doesn't already exist
560	File index_cache_directory = new File(index_directory, "cache");
561	if (!index_cache_directory.exists()) {
562	index_cache_directory.mkdir();
563	}
564
565	// Disable caching if the index cache directory isn't available
566	if (!index_cache_directory.exists() \|\| !index_cache_directory.isDirectory()) {
567	query_result_caching_enabled = false;
568	}
569	}
570
571	String query_string = null;
572
573	// Parse the command-line arguments
574	for (int i = 1; i < args.length; i++) {
575	if (args[i].equals("-sort")) {
576	i++;
577	queryer.setSortField(args[i]);
578	}
579	else if (args[i].equals("-reverse_sort")) {
580	queryer.setReverseSort(true);
581	}
582	else if (args[i].equals("-filter")) {
583	i++;
584	queryer.setFilterString(args[i]);
585	}
586	else if (args[i].equals("-dco")) {
587	i++;
588	queryer.setDefaultConjunctionOperator(args[i]);
589	}
590	else if (args[i].equals("-fuzziness")) {
591	i++;
592	queryer.setFuzziness(args[i]);
593	}
594	else if (args[i].equals("-startresults")) {
595	i++;
596	if (args[i].matches("\\d+")) {
597	queryer.setStartResults(Integer.parseInt(args[i]));
598	}
599	}
600	else if (args[i].equals("-endresults")) {
601	i++;
602	if (args[i].matches("\\d+")) {
603	queryer.setEndResults(Integer.parseInt(args[i]));
604	}
605	}
606	else {
607	query_string = args[i];
608	}
609	}
610
611	if (!queryer.initialise()) {
612	return;
613	}
614
615	// The query string has been specified as a command-line argument
616	if (query_string != null) {
617	runQueryCaching(index_directory, queryer, query_string);
618	}
619
620	// Read queries from STDIN
621	else {
622	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
623	while (true) {
624	// Read the query from STDIN
625	query_string = in.readLine();
626	if (query_string == null \|\| query_string.length() == -1) {
627	break;
628	}
629
630	runQueryCaching(index_directory, queryer, query_string);
631
632	}
633	}
634	queryer.cleanUp();
635	}
636	catch (IOException exception) {
637	exception.printStackTrace();
638	}
639	}
640
641	protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
642	throws IOException
643	{
644	StringBuffer query_results_xml = new StringBuffer();
645
646	// Check if this query result has been cached from a previous search (if it's enabled)
647	File query_result_cache_file = null;
648	if (query_result_caching_enabled) {
649	// Generate the cache file name from the query options
650	String query_result_cache_file_name = query_string + "-";
651	String fuzziness = queryer.getFuzziness();
652	query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
653	String filter_string = queryer.getFilterString();
654	query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
655	String sort_string = queryer.getSortField();
656	query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
657	String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
658	query_result_cache_file_name += reverse_sort_string + "-";
659	String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
660	query_result_cache_file_name += default_conjunction_operator + "-";
661	int start_results = queryer.getStartResults();
662	int end_results = queryer.getEndResults();
663	query_result_cache_file_name += start_results + "-" + end_results;
664	query_result_cache_file_name = fileSafe(query_result_cache_file_name);
665
666	// If the query result cache file exists, just return its contents and we're done
667	File index_cache_directory = new File(index_directory, "cache");
668	query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
669	if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
670	FileInputStream fis = new FileInputStream(query_result_cache_file);
671	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
672	BufferedReader buffered_reader = new BufferedReader(isr);
673	String line = "";
674	while ((line = buffered_reader.readLine()) != null) {
675	query_results_xml.append(line + "\n");
676	}
677	String query_results_xml_string = query_results_xml.toString();
678	query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
679
680	utf8out.print(query_results_xml_string);
681	utf8out.flush();
682
683	return;
684	}
685	}
686
687	// not cached
688	query_results_xml.append("<ResultSet cached=\"false\">\n");
689	query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
690	Filter filter = queryer.getFilter();
691	if (filter != null) {
692	query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
693	}
694
695	LuceneQueryResult query_result = queryer.runQuery(query_string);
696	if (query_result == null) {
697	System.err.println("Couldn't run the query");
698	return;
699	}
700
701	if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
702	query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
703	} else {
704	query_results_xml.append(query_result.getXMLString());
705	}
706	query_results_xml.append("</ResultSet>\n");
707
708	utf8out.print(query_results_xml);
709	utf8out.flush();
710
711	// Cache this query result, if desired
712	if (query_result_caching_enabled) {
713	// Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
714	// bother with the full stack trace. It won't affect the functionality if we can't write some cache
715	// files, it will just affect the speed of subsequent requests.
716	// Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
717	// can get very long in some collections)
718	try
719	{
720	FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
721	query_result_cache_file_writer.write(query_results_xml.toString());
722	query_result_cache_file_writer.close();
723	}
724	catch (Exception exception)
725	{
726	System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
727	}
728	}
729	}
730
731	protected static String fileSafe(String text)
732	{
733	StringBuffer file_safe_text = new StringBuffer();
734	for (int i = 0; i < text.length(); i++) {
735	char character = text.charAt(i);
736	if ((character >= 'A' && character <= 'Z') \|\| (character >= 'a' && character <= 'z') \|\| (character >= '0' && character <= '9') \|\| character == '-') {
737	file_safe_text.append(character);
738	}
739	else {
740	file_safe_text.append('%');
741	file_safe_text.append((int) character);
742	}
743	}
744	return file_safe_text.toString();
745	}
746
747
748	}
749
750

Note: See TracBrowser for help on using the repository browser.

Download in other formats: