source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 16015

Last change on this file since 16015 was 16015, checked in by davidb, 16 years ago

Printing to standard out (used as the communication mechanism back to the Perl script) is now wrapped up in a UTF-8 PrintWriter. Testing showed that it was important to flush the output each time a message is printed.

  • Property svn:keywords set to Author Date Id Revision
File size: 20.4 KB
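
The change described in the commit message corresponds to the static initialiser near the top of the file below. As a rough standalone sketch of the same pattern (the class name here is illustrative only), wrapping System.out in an auto-flushing UTF-8 writer looks like this:

    import java.io.OutputStreamWriter;
    import java.io.PrintWriter;
    import java.io.UnsupportedEncodingException;

    public class Utf8StdoutSketch {
        public static void main(String[] args) throws UnsupportedEncodingException {
            // autoFlush=true flushes on println/printf/format; plain print() still
            // needs an explicit flush() so the reading process sees it immediately
            PrintWriter utf8out = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"), true);
            utf8out.print("<ResultSet cached=\"false\"/>");
            utf8out.flush();
        }
    }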
/**********************************************************************
 *
 * GS2LuceneQuery.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.LuceneWrapper;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.TopFieldDocs;

public class GS2LuceneQuery
{


    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    private String full_indexdir = "";
    private String default_conjunction_operator = "OR";
    private String fuzziness = null;
    private String sort_field = null;
    private Sort sorter = new Sort();
    private String filter_string = null;
    private Filter filter = null;
    private int start_results = 1;
    private int end_results = Integer.MAX_VALUE;

    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    static private PrintWriter utf8out = null;

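    // Wrap System.out in an auto-flushing UTF-8 PrintWriter.  This stream is the
    // communication channel back to the calling Perl script, so every message
    // written to it is flushed as soon as it is printed.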
    static
    {
        try {
            OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
            utf8out = new PrintWriter(osw, true);
        }
        catch (UnsupportedEncodingException e) {
            System.out.println(e);
        }
    }


    public GS2LuceneQuery() {

        // Create one query parser with the standard set of stop words, and one with none

        query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
        query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    }


    public boolean initialise() {

        if (full_indexdir == null || full_indexdir.length() == 0) {
            utf8out.println("Index directory is not indicated ");
            utf8out.flush();
            return false;
        }
        try {
            searcher = new IndexSearcher(full_indexdir);
            reader = ((IndexSearcher) searcher).getIndexReader();

        }
        catch (IOException exception) {
            exception.printStackTrace();
            return false;
        }
        return true;

    }

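    // Run the given query against the index.  The expanded query terms (with
    // their document and term frequencies), any stop words that were removed,
    // and the matching documents are all recorded in the returned LuceneQueryResult.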
    public LuceneQueryResult runQuery(String query_string) {

        if (query_string == null || query_string.equals("")) {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        LuceneQueryResult lucene_query_result = new LuceneQueryResult();
        lucene_query_result.clear();

        try {
            Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
            query_including_stop_words = query_including_stop_words.rewrite(reader);

            Query query = parseQuery(reader, query_parser, query_string, fuzziness);
            query = query.rewrite(reader);

            // Get the list of expanded query terms and their frequencies
            // num docs matching, and total frequency
            HashSet terms = new HashSet();
            query.extractTerms(terms);

            Iterator iter = terms.iterator();
            while (iter.hasNext()) {

                Term term = (Term) iter.next();

                // Get the term frequency over all the documents
                TermDocs term_docs = reader.termDocs(term);
                int term_freq = 0;
                int match_docs = 0;
                while (term_docs.next()) {
                    if (term_docs.freq() != 0) {
                        term_freq += term_docs.freq();
                        match_docs++;
                    }
                }

                // Create a term
                lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
            }

            // Get the list of stop words removed from the query
            HashSet terms_including_stop_words = new HashSet();
            query_including_stop_words.extractTerms(terms_including_stop_words);
            Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
            while (terms_including_stop_words_iter.hasNext()) {
                Term term = (Term) terms_including_stop_words_iter.next();
                if (!terms.contains(term)) {
                    lucene_query_result.addStopWord(term.text());
                }
            }

            // do the query
            // Simple case for getting all the matching documents
            if (end_results == Integer.MAX_VALUE) {
                // Perform the query (filter and sorter may be null)
                Hits hits = searcher.search(query, filter, sorter);
                lucene_query_result.setTotalDocs(hits.length());

                // Output the matching documents
                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(hits.length());

                for (int i = start_results; i <= hits.length(); i++) {
                    Document doc = hits.doc(i - 1);
                    lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.score(i-1));
                }
            }

            // Slightly more complicated case for returning a subset of the matching documents
            else {
                // Perform the query (filter may be null)
                TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
                lucene_query_result.setTotalDocs(hits.totalHits);

                lucene_query_result.setStartResults(start_results);
                lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results : hits.scoreDocs.length);

                // Output the matching documents
                for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
                    Document doc = reader.document(hits.scoreDocs[i - 1].doc);
                    lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.scoreDocs[i-1].score);
                }
            }
        }

        catch (ParseException parse_exception) {
            lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
        }
        catch (TooManyClauses too_many_clauses_exception) {
            lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
        }
        catch (IOException exception) {
            lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
            exception.printStackTrace();
        }
        catch (Exception exception) {
            lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
            exception.printStackTrace();
        }
        return lucene_query_result;
    }

    public void setDefaultConjunctionOperator(String default_conjunction_operator) {
        this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
        if (this.default_conjunction_operator.equals("AND")) {
            query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
        } else { // default is OR
            query_parser.setDefaultOperator(QueryParser.OR_OPERATOR);
            query_parser_no_stop_words.setDefaultOperator(QueryParser.OR_OPERATOR);
        }
    }

    public String getDefaultConjunctionOperator() {
        return this.default_conjunction_operator;
    }

    public void setEndResults(int end_results) {
        this.end_results = end_results;
    }
    public int getEndResults() {
        return this.end_results;
    }

    public void setFilterString(String filter_string) {
        this.filter_string = filter_string;
        this.filter = parseFilterString(filter_string);
    }
    public String getFilterString() {
        return this.filter_string;
    }

    public Filter getFilter() {
        return this.filter;
    }

    public void setIndexDir(String full_indexdir) {
        this.full_indexdir = full_indexdir;
    }

    public void setFuzziness(String fuzziness) {
        this.fuzziness = fuzziness;
    }
    public String getFuzziness() {
        return this.fuzziness;
    }

    public void setSortField(String sort_field) {
        this.sort_field = sort_field;
        if (sort_field == null) {
            this.sorter = new Sort();
        } else {
            this.sorter = new Sort(sort_field);
        }
    }
    public String getSortField() {
        return this.sort_field;
    }

    public void setStartResults(int start_results) {
        if (start_results < 1) {
            start_results = 1;
        }
        this.start_results = start_results;
    }
    public int getStartResults() {
        return this.start_results;
    }

    public void cleanUp() {
        try {
            if (searcher != null) {
                searcher.close();
            }
        } catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {

            // Convert the rewritten query back to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term (whitespace or a closing
                    // parenthesis), so append the fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {

                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }

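    // Parse a filter string of the form "+FIELD:[NUM TO NUM]" (square brackets for
    // inclusive bounds, curly braces for exclusive bounds) into a Lucene RangeFilter.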
    private Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }


    protected void finalize() throws Throwable
    {
        try {
            utf8out.flush();
        } finally {
            super.finalize();
        }
    }


    /** command line program and auxiliary methods */

    // Fairly self-explanatory I should hope
    static private boolean query_result_caching_enabled = false;


    static public void main (String args[])
    {


        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
            return;
        }

        try {
            String index_directory = args[0];

            GS2LuceneQuery queryer = new GS2LuceneQuery();
            queryer.setIndexDir(index_directory);

            // Prepare the index cache directory, if query result caching is enabled
            if (query_result_caching_enabled) {
                // Make the index cache directory if it doesn't already exist
                File index_cache_directory = new File(index_directory, "cache");
                if (!index_cache_directory.exists()) {
                    index_cache_directory.mkdir();
                }

                // Disable caching if the index cache directory isn't available
                if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
                    query_result_caching_enabled = false;
                }
            }

            String query_string = null;

            // Parse the command-line arguments
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    queryer.setSortField(args[i]);
                }
                else if (args[i].equals("-filter")) {
                    i++;
                    queryer.setFilterString(args[i]);
                }
                else if (args[i].equals("-dco")) {
                    i++;
                    queryer.setDefaultConjunctionOperator(args[i]);
                }
                else if (args[i].equals("-fuzziness")) {
                    i++;
                    queryer.setFuzziness(args[i]);
                }
                else if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setStartResults(Integer.parseInt(args[i]));
                    }
                }
                else if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        queryer.setEndResults(Integer.parseInt(args[i]));
                    }
                }
                else {
                    query_string = args[i];
                }
            }

            if (!queryer.initialise()) {
                return;
            }

            // The query string has been specified as a command-line argument
            if (query_string != null) {
                runQueryCaching(index_directory, queryer, query_string);
            }

            // Read queries from STDIN
            else {
                BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
                while (true) {
                    // Read the query from STDIN
                    query_string = in.readLine();
                    if (query_string == null || query_string.length() == 0) {
                        break;
                    }

                    runQueryCaching(index_directory, queryer, query_string);

                }
            }
            queryer.cleanUp();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

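    // Run a single query.  If caching is enabled and a cached result exists, it is
    // printed straight back; otherwise the query is run, the result set is written
    // to standard out as XML, and (if enabled) cached for subsequent searches.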
    private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
        throws IOException
    {
        StringBuffer query_results_xml = new StringBuffer();

        // Check if this query result has been cached from a previous search (if it's enabled)
        File query_result_cache_file = null;
        if (query_result_caching_enabled) {
            // Generate the cache file name from the query options
            String query_result_cache_file_name = query_string + "-";
            String fuzziness = queryer.getFuzziness();
            query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
            String filter_string = queryer.getFilterString();
            query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
            String sort_string = queryer.getSortField();
            query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
            String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
            query_result_cache_file_name += default_conjunction_operator + "-";
            int start_results = queryer.getStartResults();
            int end_results = queryer.getEndResults();
            query_result_cache_file_name += start_results + "-" + end_results;
            query_result_cache_file_name = fileSafe(query_result_cache_file_name);

            // If the query result cache file exists, just return its contents and we're done
            File index_cache_directory = new File(index_directory, "cache");
            query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
            if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
                FileInputStream fis = new FileInputStream(query_result_cache_file);
                InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
                BufferedReader buffered_reader = new BufferedReader(isr);
                String line = "";
                while ((line = buffered_reader.readLine()) != null) {
                    query_results_xml.append(line + "\n");
                }
                String query_results_xml_string = query_results_xml.toString();
                query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");

                utf8out.print(query_results_xml_string);
                utf8out.flush();

                return;
            }
        }

        // not cached
        query_results_xml.append("<ResultSet cached=\"false\">\n");
        query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
        Filter filter = queryer.getFilter();
        if (filter != null) {
            query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
        }

        LuceneQueryResult query_result = queryer.runQuery(query_string);
        if (query_result == null) {
            System.err.println("Couldn't run the query");
            return;
        }

        if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
            query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
        } else {
            query_results_xml.append(query_result.getXMLString());
        }
        query_results_xml.append("</ResultSet>\n");

        utf8out.print(query_results_xml);
        utf8out.flush();

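        // Debugging aid: also write the query string and result XML to /tmp/lucenequery.txt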
        try {
            /*
            Writer output = null;
            File file = new File("/tmp/lucenequery.txt");
            output = new BufferedWriter(new FileWriter(file,"UTF-8"));
            output.write(query_results_xml.toString());
            output.close();
            */

            FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");

            OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");

            osw2.write("Query string = " + query_string + "\n");
            osw2.write(query_results_xml.toString());
            osw2.close();
        }
        catch (Exception e) {
            e.printStackTrace();
        }



        // Cache this query result, if desired
        if (query_result_caching_enabled) {
            FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
            query_result_cache_file_writer.write(query_results_xml.toString());
            query_result_cache_file_writer.close();
        }
    }

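    // Encode a string so that it can safely be used as a cache file name: letters,
    // digits and '-' pass through unchanged; every other character is replaced by
    // '%' followed by its numeric character code.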
    private static String fileSafe(String text)
    {
        StringBuffer file_safe_text = new StringBuffer();
        for (int i = 0; i < text.length(); i++) {
            char character = text.charAt(i);
            if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
                file_safe_text.append(character);
            }
            else {
                file_safe_text.append('%');
                file_safe_text.append((int) character);
            }
        }
        return file_safe_text.toString();
    }


}
