Context Navigation

source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 17804

Last change on this file since 17804 was 17804, checked in by davidb, 15 years ago
Introduction of GS2Analyzer, which overrides default behaviour of StandardAnalyzer to make accent folding of Latin-1 on
Property svn:keywords set to `Author Date Id Revision`
File size: 21.2 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneQuery.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.LuceneWrapper;
27
28
29	import java.io.*;
30	import java.util.*;
31	import java.util.regex.*;
32
33	import org.apache.lucene.analysis.Analyzer;
34	import org.apache.lucene.analysis.standard.StandardAnalyzer;
35	import org.apache.lucene.document.Document;
36	import org.apache.lucene.index.IndexReader;
37	import org.apache.lucene.index.Term;
38	import org.apache.lucene.index.TermDocs;
39	import org.apache.lucene.queryParser.ParseException;
40	import org.apache.lucene.queryParser.QueryParser;
41	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42	import org.apache.lucene.search.Filter;
43	import org.apache.lucene.search.Hit;
44	import org.apache.lucene.search.Hits;
45	import org.apache.lucene.search.IndexSearcher;
46	import org.apache.lucene.search.Query;
47	import org.apache.lucene.search.RangeFilter;
48	import org.apache.lucene.search.Searcher;
49	import org.apache.lucene.search.ScoreDoc;
50	import org.apache.lucene.search.Sort;
51	import org.apache.lucene.search.TopFieldDocs;
52
53
54	public class GS2LuceneQuery
55	{
56
57
58	static private String TEXTFIELD = "TX";
59
60	// Use the standard set of English stop words by default
61	static private String[] stop_words = GS2Analyzer.STOP_WORDS;
62
63	private String full_indexdir="";
64	private String default_conjunction_operator = "OR";
65	private String fuzziness = null;
66	private String sort_field = null;
67	private Sort sorter=new Sort();
68	private String filter_string = null;
69	private Filter filter = null;
70	private int start_results=1;
71	private int end_results=Integer.MAX_VALUE;
72
73	private QueryParser query_parser = null;
74	private QueryParser query_parser_no_stop_words = null;
75	private Searcher searcher = null;
76	private IndexReader reader = null;
77
78	static private PrintWriter utf8out = null;
79
// Wrap System.out in an auto-flushing writer that encodes its output as UTF-8.
static
{
    PrintWriter writer;
    try {
        OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
        writer = new PrintWriter(osw, true);
    }
    catch (UnsupportedEncodingException e) {
        System.out.println(e);
        // Fall back to the platform encoding instead of leaving utf8out null,
        // which would make every later print/flush call throw a NullPointerException.
        writer = new PrintWriter(System.out, true);
    }
    utf8out = writer;
}
90
91
92	public GS2LuceneQuery() {
93
94	// Create one query parser with the standard set of stop words, and one with none
95
96	query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
97	query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
98	}
99
100
101	public boolean initialise() {
102
103	if (full_indexdir==null \|\| full_indexdir.length()==-1){
104	utf8out.println("Index directory is not indicated ");
105	utf8out.flush();
106	return false;
107	}
108	try {
109	searcher = new IndexSearcher(full_indexdir);
110	reader = ((IndexSearcher) searcher).getIndexReader();
111
112	}
113	catch (IOException exception) {
114	exception.printStackTrace();
115	return false;
116	}
117	return true;
118
119	}
120
121	public LuceneQueryResult runQuery(String query_string) {
122
123	if (query_string == null \|\| query_string.equals("")) {
124	utf8out.println("The query word is not indicated ");
125	utf8out.flush();
126	return null;
127	}
128
129	LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130	lucene_query_result.clear();
131
132	try {
133	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134	query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136	// System.err.println("******* query_string " + query_string + "**");
137
138	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
139	query = query.rewrite(reader);
140
141	// Get the list of expanded query terms and their frequencies
142	// num docs matching, and total frequency
143	HashSet terms = new HashSet();
144	query.extractTerms(terms);
145
146	HashMap doc_term_freq_map = new HashMap();
147
148	Iterator iter = terms.iterator();
149	while (iter.hasNext()) {
150
151	Term term = (Term) iter.next();
152
153	// Get the term frequency over all the documents
154	TermDocs term_docs = reader.termDocs(term);
155	int term_freq = 0;
156	int match_docs = 0;
157	while (term_docs.next())
158	{
159	if (term_docs.freq() != 0)
160	{
161	term_freq += term_docs.freq();
162	match_docs++;
163
164	// Calculate the document-level term frequency as well
165	Integer lucene_doc_num_obj = new Integer(term_docs.doc());
166	int doc_term_freq = 0;
167	if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
168	{
169	doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
170	}
171	doc_term_freq += term_docs.freq();
172
173	doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
174	}
175	}
176
177	// Create a term
178	lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
179	}
180
181	// Get the list of stop words removed from the query
182	HashSet terms_including_stop_words = new HashSet();
183	query_including_stop_words.extractTerms(terms_including_stop_words);
184	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
185	while (terms_including_stop_words_iter.hasNext()) {
186	Term term = (Term) terms_including_stop_words_iter.next();
187	if (!terms.contains(term)) {
188	lucene_query_result.addStopWord(term.text());
189	}
190	}
191
192	// do the query
193	// Simple case for getting all the matching documents
194	if (end_results == Integer.MAX_VALUE) {
195	// Perform the query (filter and sorter may be null)
196	Hits hits = searcher.search(query, filter, sorter);
197	lucene_query_result.setTotalDocs(hits.length());
198
199	// Output the matching documents
200	lucene_query_result.setStartResults(start_results);
201	lucene_query_result.setEndResults(hits.length());
202
203	for (int i = start_results; i <= hits.length(); i++) {
204	int lucene_doc_num = hits.id(i - 1);
205	Document doc = hits.doc(i - 1);
206	int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
207	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
208	}
209	}
210
211	// Slightly more complicated case for returning a subset of the matching documents
212	else {
213	// Perform the query (filter may be null)
214	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
215	lucene_query_result.setTotalDocs(hits.totalHits);
216
217	lucene_query_result.setStartResults(start_results);
218	lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
219
220	// Output the matching documents
221	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
222	int lucene_doc_num = hits.scoreDocs[i - 1].doc;
223	Document doc = reader.document(lucene_doc_num);
224	int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
225	lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
226	}
227	}
228	}
229
230	catch (ParseException parse_exception) {
231	lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
232	}
233	catch (TooManyClauses too_many_clauses_exception) {
234	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
235	}
236	catch (IOException exception) {
237	lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
238	exception.printStackTrace();
239	}
240	catch (Exception exception) {
241	lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
242	exception.printStackTrace();
243	}
244	return lucene_query_result;
245	}
246
247	public void setDefaultConjunctionOperator(String default_conjunction_operator) {
248	this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
249	if (default_conjunction_operator.equals("AND")) {
250	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
251	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
252	} else { // default is OR
253	query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
254	query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
255	}
256	}
257
258	public String getDefaultConjunctionOperator() {
259	return this.default_conjunction_operator;
260	}
261
262	public void setEndResults(int end_results) {
263	this.end_results = end_results;
264	}
265	public int getEndResults() {
266	return this.end_results;
267	}
268
269	public void setFilterString(String filter_string) {
270	this.filter_string = filter_string;
271	this.filter = parseFilterString(filter_string);
272	}
273	public String getFilterString() {
274	return this.filter_string ;
275	}
276
277	public Filter getFilter() {
278	return this.filter;
279	}
280
281	public void setIndexDir(String full_indexdir) {
282	this.full_indexdir = full_indexdir;
283	}
284
285	public void setFuzziness(String fuzziness) {
286	this.fuzziness = fuzziness;
287	}
288	public String getFuzziness() {
289	return this.fuzziness;
290	}
291
292	public void setSortField(String sort_field) {
293	this.sort_field = sort_field;
294	if (sort_field == null) {
295	this.sorter = new Sort();
296	} else {
297	this.sorter = new Sort(sort_field);
298	}
299	}
300	public String getSortField() {
301	return this.sort_field;
302	}
303
304	public void setStartResults(int start_results) {
305	if (start_results < 1) {
306	start_results = 1;
307	}
308	this.start_results = start_results;
309	}
310	public int getStartResults() {
311	return this.start_results;
312	}
313
314	public void cleanUp() {
315	try {
316	if (searcher != null) {
317	searcher.close();
318	}
319	} catch (IOException exception) {
320	exception.printStackTrace();
321	}
322	}
323
324	private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
325	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
326	{
327	// Split query string into the search terms and the filter terms
328	// * The first +(...) term contains the search terms so count
329	// up '(' and stop when we finish matching ')'
330	int offset = 0;
331	int paren_count = 0;
332	boolean seen_paren = false;
333	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
334	if (query_string.charAt(offset) == '(') {
335	paren_count++;
336	seen_paren = true;
337	}
338	if (query_string.charAt(offset) == ')') {
339	paren_count--;
340	}
341	offset++;
342	}
343	String query_prefix = query_string.substring(0, offset);
344	String query_suffix = query_string.substring(offset);
345
346	///ystem.err.println("Prefix: " + query_prefix);
347	///ystem.err.println("Suffix: " + query_suffix);
348
349	Query query = query_parser.parse(query_prefix);
350	query = query.rewrite(reader);
351
352	// If this is a fuzzy search, then we need to add the fuzzy
353	// flag to each of the query terms
354	if (fuzziness != null && query.toString().length() > 0) {
355
356	// Revert the query to a string
357	System.err.println("Rewritten query: " + query.toString());
358	// Search through the string for TX:<term> query terms
359	// and append the ~ operator. Note that this search will
360	// not change phrase searches (TX:"<term> <term>") as
361	// fuzzy searching is not possible for these entries.
362	// Yahoo! Time for a state machine!
363	StringBuffer mutable_query_string = new StringBuffer(query.toString());
364	int o = 0; // Offset
365	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
366	int s = 0; // State
367	while(o < mutable_query_string.length()) {
368	char c = mutable_query_string.charAt(o);
369	if (s == 0 && c == TEXTFIELD.charAt(0)) {
370	///ystem.err.println("Found T!");
371	s = 1;
372	}
373	else if (s == 1) {
374	if (c == TEXTFIELD.charAt(1)) {
375	///ystem.err.println("Found X!");
376	s = 2;
377	}
378	else {
379	s = 0; // Reset
380	}
381	}
382	else if (s == 2) {
383	if (c == ':') {
384	///ystem.err.println("Found TX:!");
385	s = 3;
386	}
387	else {
388	s = 0; // Reset
389	}
390	}
391	else if (s == 3) {
392	// Don't process phrases
393	if (c == '"') {
394	///ystem.err.println("Stupid phrase...");
395	s = 0; // Reset
396	}
397	// Found the end of the term... add the
398	// fuzzy search indicator
399	// Nor outside the scope of parentheses
400	else if (Character.isWhitespace(c) \|\| c == ')') {
401	///ystem.err.println("Yahoo! Found fuzzy term.");
402	mutable_query_string.insert(o, '~' + fuzziness);
403	o++;
404	s = 0; // Reset
405	}
406	}
407	o++;
408	}
409	// If we were in the state of looking for the end of a
410	// term - then we just found it!
411	if (s == 3) {
412
413	mutable_query_string.append('~' + fuzziness);
414	}
415	// Reparse the query
416	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
417	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
418	}
419	else {
420	query = query_parser.parse(query_prefix + query_suffix);
421	}
422
423	return query;
424	}
425
426	private Filter parseFilterString(String filter_string)
427	{
428	Filter result = null;
429	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
430	Matcher matcher = pattern.matcher(filter_string);
431	if (matcher.matches()) {
432	String field_name = matcher.group(1);
433	boolean include_lower = matcher.group(2).equals("[");
434	String lower_term = matcher.group(3);
435	String upper_term = matcher.group(4);
436	boolean include_upper = matcher.group(5).equals("]");
437	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
438	}
439	else {
440	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
441	}
442	return result;
443	}
444
445
446	protected void finalize() throws Throwable
447	{
448	try {
449	utf8out.flush();
450	} finally {
451	super.finalize();
452	}
453	}
454
455
456	/** command line program and auxiliary methods */
457
458	// Fairly self-explanatory I should hope
459	static private boolean query_result_caching_enabled = false;
460
461
462	static public void main (String args[])
463	{
464
465
466	if (args.length == 0) {
467	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number] [query]");
468	return;
469	}
470
471	try {
472	String index_directory = args[0];
473
474	GS2LuceneQuery queryer = new GS2LuceneQuery();
475	queryer.setIndexDir(index_directory);
476
477	// Prepare the index cache directory, if query result caching is enabled
478	if (query_result_caching_enabled) {
479	// Make the index cache directory if it doesn't already exist
480	File index_cache_directory = new File(index_directory, "cache");
481	if (!index_cache_directory.exists()) {
482	index_cache_directory.mkdir();
483	}
484
485	// Disable caching if the index cache directory isn't available
486	if (!index_cache_directory.exists() \|\| !index_cache_directory.isDirectory()) {
487	query_result_caching_enabled = false;
488	}
489	}
490
491	String query_string = null;
492
493	// Parse the command-line arguments
494	for (int i = 1; i < args.length; i++) {
495	if (args[i].equals("-sort")) {
496	i++;
497	queryer.setSortField(args[i]);
498	}
499	else if (args[i].equals("-filter")) {
500	i++;
501	queryer.setFilterString(args[i]);
502	}
503	else if (args[i].equals("-dco")) {
504	i++;
505	queryer.setDefaultConjunctionOperator(args[i]);
506	}
507	else if (args[i].equals("-fuzziness")) {
508	i++;
509	queryer.setFuzziness(args[i]);
510	}
511	else if (args[i].equals("-startresults")) {
512	i++;
513	if (args[i].matches("\\d+")) {
514	queryer.setStartResults(Integer.parseInt(args[i]));
515	}
516	}
517	else if (args[i].equals("-endresults")) {
518	i++;
519	if (args[i].matches("\\d+")) {
520	queryer.setEndResults(Integer.parseInt(args[i]));
521	}
522	}
523	else {
524	query_string = args[i];
525	}
526	}
527
528	if (!queryer.initialise()) {
529	return;
530	}
531
532	// The query string has been specified as a command-line argument
533	if (query_string != null) {
534	runQueryCaching(index_directory, queryer, query_string);
535	}
536
537	// Read queries from STDIN
538	else {
539	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
540	while (true) {
541	// Read the query from STDIN
542	query_string = in.readLine();
543	if (query_string == null \|\| query_string.length() == -1) {
544	break;
545	}
546
547	runQueryCaching(index_directory, queryer, query_string);
548
549	}
550	}
551	queryer.cleanUp();
552	}
553	catch (IOException exception) {
554	exception.printStackTrace();
555	}
556	}
557
558	private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
559	throws IOException
560	{
561	StringBuffer query_results_xml = new StringBuffer();
562
563	// Check if this query result has been cached from a previous search (if it's enabled)
564	File query_result_cache_file = null;
565	if (query_result_caching_enabled) {
566	// Generate the cache file name from the query options
567	String query_result_cache_file_name = query_string + "-";
568	String fuzziness = queryer.getFuzziness();
569	query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
570	String filter_string = queryer.getFilterString();
571	query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
572	String sort_string = queryer.getSortField();
573	query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
574	String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
575	query_result_cache_file_name += default_conjunction_operator + "-";
576	int start_results = queryer.getStartResults();
577	int end_results = queryer.getEndResults();
578	query_result_cache_file_name += start_results + "-" + end_results;
579	query_result_cache_file_name = fileSafe(query_result_cache_file_name);
580
581	// If the query result cache file exists, just return its contents and we're done
582	File index_cache_directory = new File(index_directory, "cache");
583	query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
584	if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
585	FileInputStream fis = new FileInputStream(query_result_cache_file);
586	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
587	BufferedReader buffered_reader = new BufferedReader(isr);
588	String line = "";
589	while ((line = buffered_reader.readLine()) != null) {
590	query_results_xml.append(line + "\n");
591	}
592	String query_results_xml_string = query_results_xml.toString();
593	query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
594
595	utf8out.print(query_results_xml_string);
596	utf8out.flush();
597
598	return;
599	}
600	}
601
602	// not cached
603	query_results_xml.append("<ResultSet cached=\"false\">\n");
604	query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
605	Filter filter = queryer.getFilter();
606	if (filter != null) {
607	query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
608	}
609
610	LuceneQueryResult query_result = queryer.runQuery(query_string);
611	if (query_result == null) {
612	System.err.println("Couldn't run the query");
613	return;
614	}
615
616	if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
617	query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
618	} else {
619	query_results_xml.append(query_result.getXMLString());
620	}
621	query_results_xml.append("</ResultSet>\n");
622
623	utf8out.print(query_results_xml);
624	utf8out.flush();
625
626	try {
627	/*
628	Writer output = null;
629	File file = new File("/tmp/lucenequery.txt");
630	output = new BufferedWriter(new FileWriter(file,"UTF-8"));
631	output.write(query_results_xml.toString());
632	output.close();
633	*/
634
635	FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");
636
637	OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");
638
639	osw2.write("Query string = " + query_string + "\n");
640	osw2.write(query_results_xml.toString());
641	osw2.close();
642	}
643	catch (Exception e) {
644	e.printStackTrace();
645	}
646
647
648
649	// Cache this query result, if desired
650	if (query_result_caching_enabled) {
651	FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
652	query_result_cache_file_writer.write(query_results_xml.toString());
653	query_result_cache_file_writer.close();
654	}
655	}
656
657	private static String fileSafe(String text)
658	{
659	StringBuffer file_safe_text = new StringBuffer();
660	for (int i = 0; i < text.length(); i++) {
661	char character = text.charAt(i);
662	if ((character >= 'A' && character <= 'Z') \|\| (character >= 'a' && character <= 'z') \|\| (character >= '0' && character <= '9') \|\| character == '-') {
663	file_safe_text.append(character);
664	}
665	else {
666	file_safe_text.append('%');
667	file_safe_text.append((int) character);
668	}
669	}
670	return file_safe_text.toString();
671	}
672
673
674	}
675
676

Note: See TracBrowser for help on using the repository browser.

Download in other formats: