Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper4/GS2LuceneQuery.java@ 29148

Last change on this file since 29148 was 29148, checked in by ak19, 10 years ago
Part of port from lucene3.3.0 to lucene4.7.2. Related to LuceneWrapper. 1. Updating the lucene-gs makefiles to allow compiling up Lucene4Wrapper.jar or Lucene3Wrapper.jar. Only the Linux Makefile.in has been tested so far. 2. Adding in the jar files necessary for Lucene4Wrapper into the lib folder's new lucene4 subfolder. 3. Updating the Lucene src code to use lucene4.7.2 instead of lucene3.3.0.
File size: 26.9 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneQuery.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.LuceneWrapper4;
27
28
29	import java.io.*;
30	import java.util.*;
31	import java.util.regex.*;
32
33	import org.apache.lucene.analysis.Analyzer;
34	import org.apache.lucene.analysis.standard.StandardAnalyzer;
35	import org.apache.lucene.document.Document;
36	import org.apache.lucene.index.DirectoryReader;
37	import org.apache.lucene.index.IndexReader;
38	import org.apache.lucene.index.Term;
39	//import org.apache.lucene.index.TermDocs;
40	import org.apache.lucene.queryparser.classic.ParseException;
41	import org.apache.lucene.queryparser.classic.QueryParser;
42	import org.apache.lucene.search.BooleanQuery; // for the TooManyClauses exception
43	import org.apache.lucene.search.Filter;
44	import org.apache.lucene.search.IndexSearcher;
45	import org.apache.lucene.search.MultiTermQuery;
46	import org.apache.lucene.search.MultiTermQuery.ConstantScoreAutoRewrite;
47	import org.apache.lucene.search.Query;
48	import org.apache.lucene.search.TermRangeFilter;
49	import org.apache.lucene.search.IndexSearcher; // Searcher is deprecated
50	import org.apache.lucene.search.ScoreDoc;
51	import org.apache.lucene.search.Sort;
52	import org.apache.lucene.search.SortField;
53	import org.apache.lucene.search.TopFieldDocs;
54
55	import org.apache.lucene.index.DocsEnum;
56	import org.apache.lucene.index.MultiFields;
57
58	import org.apache.lucene.store.Directory;
59	import org.apache.lucene.store.FSDirectory;
60
61	import org.apache.lucene.util.Bits;
62	import org.apache.lucene.util.BytesRef;
63	import org.apache.lucene.util.Version;
64
65	public class GS2LuceneQuery extends SharedSoleneQuery
66	{
67	public static String SORT_RANK = "rank";
68	public static String SORT_NATURAL = "natural";
69
70	protected String full_indexdir="";
71
72	protected SortField.Type sort_type = SortField.Type.SCORE;
73	protected boolean reverse_sort = false;
74	protected Sort sorter=new Sort();
75	protected Filter filter = null;
76
77	protected QueryParser query_parser = null;
78	protected QueryParser query_parser_no_stop_words = null;
79	protected IndexSearcher searcher = null;
80	protected IndexReader reader = null;
81
82	public GS2LuceneQuery() {
83	super();
84
85	// Create one query parser with the standard set of stop words, and one with none
86
87	query_parser = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
88	query_parser_no_stop_words = new QueryParser(GSLuceneConstants.MATCH_VERSION, TEXTFIELD, new GS2Analyzer(new String[] { }));
89	}
90
91
92	public boolean initialise() {
93
94	if (!super.initialise()) {
95	return false;
96	}
97
98
99	if (full_indexdir==null \|\| full_indexdir.length()==-1){
100	utf8out.println("Index directory is not indicated ");
101	utf8out.flush();
102	return false;
103	}
104
105	try {
106	Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
107
108	reader = DirectoryReader.open(full_indexdir_dir); // Returns a IndexReader reading the index in the given Directory. now readOnly=true by default, and therefore also for searcher
109	searcher = new IndexSearcher(reader); // during searcher.search() will get it to compute ranks when sorting by fields
110
111	this.sorter = new Sort(new SortField(this.sort_field, this.sort_type, this.reverse_sort));
112	}
113	catch (IOException exception) {
114	exception.printStackTrace();
115	return false;
116	}
117	return true;
118
119	}
120
121	public void setIndexDir(String full_indexdir) {
122	this.full_indexdir = full_indexdir;
123	}
124
125	public void setSortField(String sort_field) {
126	if (sort_field.equals(SORT_RANK)) {
127	this.sort_field = null;
128	this.sort_type = SortField.Type.SCORE;
129	} else if (sort_field.equals(SORT_NATURAL)) {
130	this.sort_field = null;
131	this.sort_type = SortField.Type.DOC;
132	} else {
133	this.sort_field = sort_field;
134	this.sort_type = SortField.Type.STRING; // for now. numeric??
135	}
136	}
137	public void setReverseSort(boolean reverse) {
138	this.reverse_sort = reverse;
139	}
140	public boolean getReverseSort() {
141	return this.reverse_sort;
142	}
143
144	public void setFilterString(String filter_string) {
145	super.setFilterString(filter_string);
146	this.filter = parseFilterString(filter_string);
147	}
148
149	public Filter getFilter() {
150	return this.filter;
151	}
152
153
154	public LuceneQueryResult runQuery(String query_string) {
155
156	if (query_string == null \|\| query_string.equals("")) {
157	utf8out.println("The query word is not indicated ");
158	utf8out.flush();
159	return null;
160	}
161
162	LuceneQueryResult lucene_query_result=new LuceneQueryResult();
163	lucene_query_result.clear();
164
165	try {
166	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
167	query_including_stop_words = query_including_stop_words.rewrite(reader);
168
169	// System.err.println("******* query_string " + query_string + "**");
170
171	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
172
173	// GS2's LuceneWrapper uses lucene-2.3.2. GS3's LuceneWrapper3 works with lucene-3.3.0.
174	// This change in lucene core library for GS3 (present since after version 2.4.1) had the
175	// side-effect that searching on "econom*" didn't display what terms it was searching for,
176	// whereas it had done so in GS2.
177
178	// The details of this problem and its current solution are explained in the ticket
179	// http://trac.greenstone.org/ticket/845
180
181	// We need to change the settings for the rewriteMethod in order to get searches on wildcards
182	// to produce search terms again when the query gets rewritten.
183
184	// We try, in order:
185	// 1. RewriteMethod set to BooleanQuery, to get it working as in GS2 which uses lucene-2.3.2
186	// it will expand wildcard searches to its terms when searching at both section AND doc level.
187	// If that throws a TooManyClauses exception (like when searching for "a*" over lucene demo collection)
188	// 2. Then try a custom rewriteMethod which sets termCountCutoff=350 and docCountPercent cutoff=0.1%
189	// If that throws a TooManyClauses exception (could perhaps happen if the collection has a huge number of docs
190	// 3. Then try the default apache rewriteMethod with its optimum defaults of
191	// termCountCutoff=350 and docCountPercent cutoff=0.1%
192	// See http://lucene.apache.org/core/3_6_1/api/core/org/apache/lucene/search/MultiTermQuery.html
193
194	if(query instanceof MultiTermQuery) {
195	MultiTermQuery multiTermQuery = (MultiTermQuery)query;
196	multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
197	// less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
198	}
199
200	try {
201	query = query.rewrite(reader);
202	}
203	catch(BooleanQuery.TooManyClauses clauseException) {
204	// Example test case: try searching the lucene demo collection for "a*"
205	// and you'll hit this exception
206
207	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
208
209	if(query instanceof MultiTermQuery) {
210
211	// CustomRewriteMethod: setting the docCountPercent cutoff to a custom 100%.
212	// This will at least expand the query to its terms when searching with wildcards at section-level
213	// (though it doesn't seem to work for doc-level searches, no matter what the cutoffs are set to).
214
215	MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
216	customRewriteMethod.setDocCountPercent(100.0);
217	customRewriteMethod.setTermCountCutoff(350); // same as default
218
219	MultiTermQuery multiTermQuery = (MultiTermQuery)query;
220	multiTermQuery.setRewriteMethod(customRewriteMethod);
221	try {
222	query = query.rewrite(reader);
223	}
224	catch(BooleanQuery.TooManyClauses clauseExceptionAgain) {
225
226	// do what the code originally did: use the default rewriteMethod which
227	// uses a default docCountPercent=0.1 (%) and termCountCutoff=350
228
229	multiTermQuery.setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
230	query = query.rewrite(reader);
231	}
232	}
233	}
234
235	// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
236	// http://stackoverflow.com/questions/20575254/lucene-4-4-how-to-get-term-frequency-over-all-index
237	// http://stackoverflow.com/questions/8938960/how-to-get-document-ids-for-document-term-vector-in-lucene?rq=1
238	// https://github.com/hibernate/hibernate-search/blob/master/orm/src/test/java/org/hibernate/search/test/filter/BestDriversFilter.java
239	// http://lucene.apache.org/core/4_7_2/MIGRATE.html
240
241	// Get the list of expanded query terms and their frequencies
242	// num docs matching, and total frequency
243	HashSet terms = new HashSet();
244	query.extractTerms(terms);
245
246	HashMap doc_term_freq_map = new HashMap();
247
248	Iterator iter = terms.iterator();
249
250	Bits liveDocs = null;
251	if(reader.hasDeletions()) {
252	System.err.println("@@@ GS2LuceneQuery.java: There have been deletions. Merging to get liveDocs.");
253	liveDocs = MultiFields.getLiveDocs(reader); // SLOW! But getLiveDocs returns null if there are no deletions
254	}
255
256	while (iter.hasNext()) {
257
258	// http://stackoverflow.com/questions/13537126/term-frequency-in-lucene-4-0
259
260	Term term = (Term) iter.next();
261	BytesRef term_bytes = term.bytes();
262	DocsEnum term_docs = MultiFields.getTermDocsEnum(reader, liveDocs, term.field(), term_bytes); // flags?
263
264	// Get the term frequency over all the documents
265	//TermDocs term_docs = reader.termDocs(term);
266	int term_freq = 0;
267	int match_docs = 0;
268
269	int docID = -1;
270	while((docID = term_docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {//while (term_docs.next())
271	if (term_docs.freq() != 0)
272	{
273	term_freq += term_docs.freq();
274	match_docs++;
275
276	// Calculate the document-level term frequency as well
277	Integer lucene_doc_num_obj = new Integer(term_docs.docID());
278	int doc_term_freq = 0;
279	if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
280	{
281	doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
282	}
283	doc_term_freq += term_docs.freq();
284
285	doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
286	}
287	}
288
289	// Create a term
290	lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
291	}
292
293	// Get the list of stop words removed from the query
294	HashSet terms_including_stop_words = new HashSet();
295	query_including_stop_words.extractTerms(terms_including_stop_words);
296	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
297	while (terms_including_stop_words_iter.hasNext()) {
298	Term term = (Term) terms_including_stop_words_iter.next();
299	if (!terms.contains(term)) {
300	lucene_query_result.addStopWord(term.text());
301	}
302	}
303
304	// do the query
305	// Simple case for getting all the matching documents
306	if (end_results == Integer.MAX_VALUE) {
307	// Perform the query (filter and sorter may be null)
308	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
309	// Is there a slight difference in the definition between
310	// https://lucene.apache.org/core/3_6_0/api/all/org/apache/lucene/search/IndexSearcher.html#setDefaultFieldSortScoring%28boolean,%20boolean%29
311	// and http://lucene.apache.org/core/4_7_2/core/org/apache/lucene/search/IndexSearcher.html#search%28org.apache.lucene.search.Query,%20org.apache.lucene.search.Filter,%20int,%20org.apache.lucene.search.Sort,%20boolean,%20boolean%29
312	// Seems to be okay.
313	// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
314
315	lucene_query_result.setTotalDocs(hits.totalHits);
316
317	// Output the matching documents
318	lucene_query_result.setStartResults(start_results);
319	lucene_query_result.setEndResults(hits.totalHits);
320
321	for (int i = start_results; i <= hits.totalHits; i++) {
322	int lucene_doc_num = hits.scoreDocs[i - 1].doc;
323	Document doc = reader.document(lucene_doc_num);
324	int doc_term_freq = 0;
325	Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
326	if (doc_term_freq_object != null)
327	{
328	doc_term_freq = doc_term_freq_object.intValue();
329	}
330	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
331	}
332	}
333
334	// Slightly more complicated case for returning a subset of the matching documents
335	else {
336	// Perform the query (filter may be null)
337	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter, true, true); // doDocScores=true, doMaxScore=true
338	// See also http://stackoverflow.com/questions/7910241/in-lucene-what-is-the-purpose-of-setdefaultfieldsortscoring
339	lucene_query_result.setTotalDocs(hits.totalHits);
340
341	lucene_query_result.setStartResults(start_results);
342	lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
343
344	// Output the matching documents
345	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
346	int lucene_doc_num = hits.scoreDocs[i - 1].doc;
347	Document doc = reader.document(lucene_doc_num);
348	int doc_term_freq = 0;
349	Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
350	if (doc_term_freq_object != null)
351	{
352	doc_term_freq = doc_term_freq_object.intValue();
353	}
354	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
355	}
356	}
357	}
358
359	catch (ParseException parse_exception) {
360	lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
361	}
362	catch (BooleanQuery.TooManyClauses too_many_clauses_exception) {
363	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
364	}
365	catch (IOException exception) {
366	lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
367	exception.printStackTrace();
368	}
369	catch (Exception exception) {
370	lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
371	exception.printStackTrace();
372	}
373	return lucene_query_result;
374	}
375
376	public void setDefaultConjunctionOperator(String default_conjunction_operator) {
377	super.setDefaultConjunctionOperator(default_conjunction_operator);
378
379	if (default_conjunction_operator.equals("AND")) {
380	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
381	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
382	} else { // default is OR
383	query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
384	query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
385	}
386	}
387
388
389	public void cleanUp() {
390	super.cleanUp();
391	try {
392	if(reader != null) {
393	reader.close();
394	// Closes files associated with this index. Also saves any new deletions to disk.
395	// No other methods should be called after this has been called.
396	}
397	} catch (IOException exception) {
398	exception.printStackTrace();
399	}
400	}
401
402
403	protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
404	throws java.io.IOException, org.apache.lucene.queryparser.classic.ParseException
405	{
406	// Split query string into the search terms and the filter terms
407	// * The first +(...) term contains the search terms so count
408	// up '(' and stop when we finish matching ')'
409	int offset = 0;
410	int paren_count = 0;
411	boolean seen_paren = false;
412	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
413	if (query_string.charAt(offset) == '(') {
414	paren_count++;
415	seen_paren = true;
416	}
417	if (query_string.charAt(offset) == ')') {
418	paren_count--;
419	}
420	offset++;
421	}
422	String query_prefix = query_string.substring(0, offset);
423	String query_suffix = query_string.substring(offset);
424
425	///ystem.err.println("Prefix: " + query_prefix);
426	///ystem.err.println("Suffix: " + query_suffix);
427
428	Query query = query_parser.parse(query_prefix);
429	query = query.rewrite(reader);
430
431	// If this is a fuzzy search, then we need to add the fuzzy
432	// flag to each of the query terms
433	if (fuzziness != null && query.toString().length() > 0) {
434
435	// Revert the query to a string
436	System.err.println("Rewritten query: " + query.toString());
437	// Search through the string for TX:<term> query terms
438	// and append the ~ operator. Note that this search will
439	// not change phrase searches (TX:"<term> <term>") as
440	// fuzzy searching is not possible for these entries.
441	// Yahoo! Time for a state machine!
442	StringBuffer mutable_query_string = new StringBuffer(query.toString());
443	int o = 0; // Offset
444	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
445	int s = 0; // State
446	while(o < mutable_query_string.length()) {
447	char c = mutable_query_string.charAt(o);
448	if (s == 0 && c == TEXTFIELD.charAt(0)) {
449	///ystem.err.println("Found T!");
450	s = 1;
451	}
452	else if (s == 1) {
453	if (c == TEXTFIELD.charAt(1)) {
454	///ystem.err.println("Found X!");
455	s = 2;
456	}
457	else {
458	s = 0; // Reset
459	}
460	}
461	else if (s == 2) {
462	if (c == ':') {
463	///ystem.err.println("Found TX:!");
464	s = 3;
465	}
466	else {
467	s = 0; // Reset
468	}
469	}
470	else if (s == 3) {
471	// Don't process phrases
472	if (c == '"') {
473	///ystem.err.println("Stupid phrase...");
474	s = 0; // Reset
475	}
476	// Found the end of the term... add the
477	// fuzzy search indicator
478	// Nor outside the scope of parentheses
479	else if (Character.isWhitespace(c) \|\| c == ')') {
480	///ystem.err.println("Yahoo! Found fuzzy term.");
481	mutable_query_string.insert(o, '~' + fuzziness);
482	o++;
483	s = 0; // Reset
484	}
485	}
486	o++;
487	}
488	// If we were in the state of looking for the end of a
489	// term - then we just found it!
490	if (s == 3) {
491
492	mutable_query_string.append('~' + fuzziness);
493	}
494	// Reparse the query
495	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
496	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
497	}
498	else {
499	query = query_parser.parse(query_prefix + query_suffix);
500	}
501
502	return query;
503	}
504
505	protected Filter parseFilterString(String filter_string)
506	{
507	Filter result = null;
508	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
509	Matcher matcher = pattern.matcher(filter_string);
510	if (matcher.matches()) {
511	String field_name = matcher.group(1);
512	boolean include_lower = matcher.group(2).equals("[");
513	BytesRef lower_term = new BytesRef(matcher.group(3));
514	BytesRef upper_term = new BytesRef(matcher.group(4));
515	boolean include_upper = matcher.group(5).equals("]");
516	result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
517	}
518	else {
519	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
520	}
521	return result;
522	}
523
524
525	/** command line program and auxiliary methods */
526
527	// Fairly self-explanatory I should hope
528	static protected boolean query_result_caching_enabled = false;
529
530
531	static public void main (String args[])
532	{
533	if (args.length == 0) {
534	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-reverse_sort][-dco AND\|OR] [-startresults number -endresults number] [query]");
535	return;
536	}
537
538	try {
539	String index_directory = args[0];
540
541	GS2LuceneQuery queryer = new GS2LuceneQuery();
542	queryer.setIndexDir(index_directory);
543
544	// Prepare the index cache directory, if query result caching is enabled
545	if (query_result_caching_enabled) {
546	// Make the index cache directory if it doesn't already exist
547	File index_cache_directory = new File(index_directory, "cache");
548	if (!index_cache_directory.exists()) {
549	index_cache_directory.mkdir();
550	}
551
552	// Disable caching if the index cache directory isn't available
553	if (!index_cache_directory.exists() \|\| !index_cache_directory.isDirectory()) {
554	query_result_caching_enabled = false;
555	}
556	}
557
558	String query_string = null;
559
560	// Parse the command-line arguments
561	for (int i = 1; i < args.length; i++) {
562	if (args[i].equals("-sort")) {
563	i++;
564	queryer.setSortField(args[i]);
565	}
566	else if (args[i].equals("-reverse_sort")) {
567	queryer.setReverseSort(true);
568	}
569	else if (args[i].equals("-filter")) {
570	i++;
571	queryer.setFilterString(args[i]);
572	}
573	else if (args[i].equals("-dco")) {
574	i++;
575	queryer.setDefaultConjunctionOperator(args[i]);
576	}
577	else if (args[i].equals("-fuzziness")) {
578	i++;
579	queryer.setFuzziness(args[i]);
580	}
581	else if (args[i].equals("-startresults")) {
582	i++;
583	if (args[i].matches("\\d+")) {
584	queryer.setStartResults(Integer.parseInt(args[i]));
585	}
586	}
587	else if (args[i].equals("-endresults")) {
588	i++;
589	if (args[i].matches("\\d+")) {
590	queryer.setEndResults(Integer.parseInt(args[i]));
591	}
592	}
593	else {
594	query_string = args[i];
595	}
596	}
597
598	if (!queryer.initialise()) {
599	return;
600	}
601
602	// The query string has been specified as a command-line argument
603	if (query_string != null) {
604	runQueryCaching(index_directory, queryer, query_string);
605	}
606
607	// Read queries from STDIN
608	else {
609	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
610	while (true) {
611	// Read the query from STDIN
612	query_string = in.readLine();
613	if (query_string == null \|\| query_string.length() == -1) {
614	break;
615	}
616
617	runQueryCaching(index_directory, queryer, query_string);
618
619	}
620	}
621	queryer.cleanUp();
622	}
623	catch (IOException exception) {
624	exception.printStackTrace();
625	}
626	}
627
628	protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
629	throws IOException
630	{
631	StringBuffer query_results_xml = new StringBuffer();
632
633	// Check if this query result has been cached from a previous search (if it's enabled)
634	File query_result_cache_file = null;
635	if (query_result_caching_enabled) {
636	// Generate the cache file name from the query options
637	String query_result_cache_file_name = query_string + "-";
638	String fuzziness = queryer.getFuzziness();
639	query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
640	String filter_string = queryer.getFilterString();
641	query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
642	String sort_string = queryer.getSortField();
643	query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
644	String reverse_sort_string = (queryer.getReverseSort() ? "1" : "0");
645	query_result_cache_file_name += reverse_sort_string + "-";
646	String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
647	query_result_cache_file_name += default_conjunction_operator + "-";
648	int start_results = queryer.getStartResults();
649	int end_results = queryer.getEndResults();
650	query_result_cache_file_name += start_results + "-" + end_results;
651	query_result_cache_file_name = fileSafe(query_result_cache_file_name);
652
653	// If the query result cache file exists, just return its contents and we're done
654	File index_cache_directory = new File(index_directory, "cache");
655	query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
656	if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
657	FileInputStream fis = new FileInputStream(query_result_cache_file);
658	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
659	BufferedReader buffered_reader = new BufferedReader(isr);
660	String line = "";
661	while ((line = buffered_reader.readLine()) != null) {
662	query_results_xml.append(line + "\n");
663	}
664	String query_results_xml_string = query_results_xml.toString();
665	query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
666
667	utf8out.print(query_results_xml_string);
668	utf8out.flush();
669
670	return;
671	}
672	}
673
674	// not cached
675	query_results_xml.append("<ResultSet cached=\"false\">\n");
676	query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
677	Filter filter = queryer.getFilter();
678	if (filter != null) {
679	query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
680	}
681
682	LuceneQueryResult query_result = queryer.runQuery(query_string);
683	if (query_result == null) {
684	System.err.println("Couldn't run the query");
685	return;
686	}
687
688	if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
689	query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
690	} else {
691	query_results_xml.append(query_result.getXMLString());
692	}
693	query_results_xml.append("</ResultSet>\n");
694
695	utf8out.print(query_results_xml);
696	utf8out.flush();
697
698	// Cache this query result, if desired
699	if (query_result_caching_enabled) {
700	// Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
701	// bother with the full stack trace. It won't affect the functionality if we can't write some cache
702	// files, it will just affect the speed of subsequent requests.
703	// Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
704	// can get very long in some collections)
705	try
706	{
707	FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
708	query_result_cache_file_writer.write(query_results_xml.toString());
709	query_result_cache_file_writer.close();
710	}
711	catch (Exception exception)
712	{
713	System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
714	}
715	}
716	}
717
718	protected static String fileSafe(String text)
719	{
720	StringBuffer file_safe_text = new StringBuffer();
721	for (int i = 0; i < text.length(); i++) {
722	char character = text.charAt(i);
723	if ((character >= 'A' && character <= 'Z') \|\| (character >= 'a' && character <= 'z') \|\| (character >= '0' && character <= '9') \|\| character == '-') {
724	file_safe_text.append(character);
725	}
726	else {
727	file_safe_text.append('%');
728	file_safe_text.append((int) character);
729	}
730	}
731	return file_safe_text.toString();
732	}
733
734
735	}
736
737

Note: See TracBrowser for help on using the repository browser.

Download in other formats: