source: indexers/trunk/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 16947

Last change on this file since 16947 was 16947, checked in by mdewsnip, 16 years ago

Changed the Lucene code to use the Greenstone document OIDs directly, instead of creating its own numeric IDs and then mapping them to the Greenstone OIDs in the GDBM file. As well as being simpler and more space and speed efficient (the mapping no longer needs to be stored in the GDBM file, and no lookup needs to be done for each search result), this is another important step along the road to true incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.1 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
public class GS2LuceneQuery
{

    // Name of the Lucene field holding the full text of each document
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    // Path of the Lucene index directory to search (set via setIndexDir)
    private String full_indexdir="";
    // Boolean operator applied between query terms when none is given ("AND" or "OR")
    private String default_conjunction_operator = "OR";
    // Fuzzy-match factor appended to query terms (e.g. "0.7"), or null for exact matching
    private String fuzziness = null;
    // Field to sort results on, or null for relevance ordering; sorter mirrors it
    private String sort_field = null;
    private Sort sorter=new Sort();
    // Raw filter expression (e.g. "+CD:[2000 TO 2005]") and its parsed Lucene form
    private String filter_string = null;
    private Filter filter = null;
    // 1-based, inclusive range of result positions to return
    private int start_results=1;
    private int end_results=Integer.MAX_VALUE;

    // Parser used for normal queries (stop words removed) and a second parser
    // that keeps stop words — used only to report which stop words were dropped
    private QueryParser query_parser = null;
    private QueryParser query_parser_no_stop_words = null;
    private Searcher searcher = null;
    private IndexReader reader = null;

    // All program output is written through this UTF-8 writer around stdout
    static private PrintWriter utf8out = null;

    static
    {
	try {
	    OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
	    utf8out = new PrintWriter(osw, true);
	}
	catch (UnsupportedEncodingException e) {
	    // "UTF-8" is a mandatory charset, so this should never happen
	    System.out.println(e);
	}
    }


    public GS2LuceneQuery() {

	// Create one query parser with the standard set of stop words, and one with none

	query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
	query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
    }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
126 return null;
127 }
128
129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
137 query = query.rewrite(reader);
138
139 // Get the list of expanded query terms and their frequencies
140 // num docs matching, and total frequency
141 HashSet terms = new HashSet();
142 query.extractTerms(terms);
143
144 HashMap doc_term_freq_map = new HashMap();
145
146 Iterator iter = terms.iterator();
147 while (iter.hasNext()) {
148
149 Term term = (Term) iter.next();
150
151 // Get the term frequency over all the documents
152 TermDocs term_docs = reader.termDocs(term);
153 int term_freq = 0;
154 int match_docs = 0;
155 while (term_docs.next())
156 {
157 if (term_docs.freq() != 0)
158 {
159 term_freq += term_docs.freq();
160 match_docs++;
161
162 // Calculate the document-level term frequency as well
163 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
164 int doc_term_freq = 0;
165 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
166 {
167 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
168 }
169 doc_term_freq += term_docs.freq();
170
171 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
172 }
173 }
174
175 // Create a term
176 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
177 }
178
179 // Get the list of stop words removed from the query
180 HashSet terms_including_stop_words = new HashSet();
181 query_including_stop_words.extractTerms(terms_including_stop_words);
182 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
183 while (terms_including_stop_words_iter.hasNext()) {
184 Term term = (Term) terms_including_stop_words_iter.next();
185 if (!terms.contains(term)) {
186 lucene_query_result.addStopWord(term.text());
187 }
188 }
189
190 // do the query
191 // Simple case for getting all the matching documents
192 if (end_results == Integer.MAX_VALUE) {
193 // Perform the query (filter and sorter may be null)
194 Hits hits = searcher.search(query, filter, sorter);
195 lucene_query_result.setTotalDocs(hits.length());
196
197 // Output the matching documents
198 lucene_query_result.setStartResults(start_results);
199 lucene_query_result.setEndResults(hits.length());
200
201 for (int i = start_results; i <= hits.length(); i++) {
202 int lucene_doc_num = hits.id(i - 1);
203 Document doc = hits.doc(i - 1);
204 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
205 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
206 }
207 }
208
209 // Slightly more complicated case for returning a subset of the matching documents
210 else {
211 // Perform the query (filter may be null)
212 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
213 lucene_query_result.setTotalDocs(hits.totalHits);
214
215 lucene_query_result.setStartResults(start_results);
216 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
217
218 // Output the matching documents
219 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
220 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
221 Document doc = reader.document(lucene_doc_num);
222 int doc_term_freq = ((Integer) doc_term_freq_map.get(new Integer(lucene_doc_num))).intValue();
223 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
224 }
225 }
226 }
227
228 catch (ParseException parse_exception) {
229 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
230 }
231 catch (TooManyClauses too_many_clauses_exception) {
232 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
233 }
234 catch (IOException exception) {
235 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
236 exception.printStackTrace();
237 }
238 catch (Exception exception) {
239 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
240 exception.printStackTrace();
241 }
242 return lucene_query_result;
243 }
244
245 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
246 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
247 if (default_conjunction_operator.equals("AND")) {
248 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
249 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
250 } else { // default is OR
251 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
252 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
253 }
254 }
255
256 public String getDefaultConjunctionOperator() {
257 return this.default_conjunction_operator;
258 }
259
260 public void setEndResults(int end_results) {
261 this.end_results = end_results;
262 }
263 public int getEndResults() {
264 return this.end_results;
265 }
266
267 public void setFilterString(String filter_string) {
268 this.filter_string = filter_string;
269 this.filter = parseFilterString(filter_string);
270 }
271 public String getFilterString() {
272 return this.filter_string ;
273 }
274
275 public Filter getFilter() {
276 return this.filter;
277 }
278
279 public void setIndexDir(String full_indexdir) {
280 this.full_indexdir = full_indexdir;
281 }
282
283 public void setFuzziness(String fuzziness) {
284 this.fuzziness = fuzziness;
285 }
286 public String getFuzziness() {
287 return this.fuzziness;
288 }
289
290 public void setSortField(String sort_field) {
291 this.sort_field = sort_field;
292 if (sort_field == null) {
293 this.sorter = new Sort();
294 } else {
295 this.sorter = new Sort(sort_field);
296 }
297 }
298 public String getSortField() {
299 return this.sort_field;
300 }
301
302 public void setStartResults(int start_results) {
303 if (start_results < 1) {
304 start_results = 1;
305 }
306 this.start_results = start_results;
307 }
308 public int getStartResults() {
309 return this.start_results;
310 }
311
312 public void cleanUp() {
313 try {
314 if (searcher != null) {
315 searcher.close();
316 }
317 } catch (IOException exception) {
318 exception.printStackTrace();
319 }
320 }
321
    /**
     * Parses the raw query string into a Lucene Query, optionally appending a
     * fuzzy operator ("~" plus the fuzziness factor) to each plain TX:<term>
     * search term.
     *
     * The query string is first split into a prefix — the leading balanced
     * "(...)" group, which holds the search terms — and a suffix holding any
     * trailing filter terms. Only the prefix undergoes fuzzy rewriting; the
     * suffix is re-appended unchanged before the final parse.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser used for both the initial and final parse
     * @param query_string raw query expression
     * @param fuzziness    fuzzy factor to append to each term, or null for none
     * @return the parsed (and possibly fuzzy-rewritten) query
     */
    private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	//   up '(' and stop when we finish matching ')'
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	// Note: if the query has no parentheses at all, this scan consumes the
	// whole string, leaving an empty suffix
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {

	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    int s = 0; // State
	    while(o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness concatenates to e.g. "~0.7"; o is
			// bumped past the inserted '~' and the loop's own o++
			// then resumes scanning inside the inserted factor,
			// which cannot start a new "TX:" match from state 0
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {

		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
423
424 private Filter parseFilterString(String filter_string)
425 {
426 Filter result = null;
427 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
428 Matcher matcher = pattern.matcher(filter_string);
429 if (matcher.matches()) {
430 String field_name = matcher.group(1);
431 boolean include_lower = matcher.group(2).equals("[");
432 String lower_term = matcher.group(3);
433 String upper_term = matcher.group(4);
434 boolean include_upper = matcher.group(5).equals("]");
435 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
436 }
437 else {
438 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
439 }
440 return result;
441 }
442
443
    /**
     * Invoked by the garbage collector: ensures any buffered query output has
     * been flushed to stdout before the object is reclaimed.
     */
    protected void finalize() throws Throwable
    {
	try {
	    utf8out.flush();
	} finally {
	    super.finalize();
	}
    }
452
453
454 /** command line program and auxiliary methods */
455
456 // Fairly self-explanatory I should hope
457 static private boolean query_result_caching_enabled = false;
458
459
460 static public void main (String args[])
461 {
462
463
464 if (args.length == 0) {
465 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
466 return;
467 }
468
469 try {
470 String index_directory = args[0];
471
472 GS2LuceneQuery queryer = new GS2LuceneQuery();
473 queryer.setIndexDir(index_directory);
474
475 // Prepare the index cache directory, if query result caching is enabled
476 if (query_result_caching_enabled) {
477 // Make the index cache directory if it doesn't already exist
478 File index_cache_directory = new File(index_directory, "cache");
479 if (!index_cache_directory.exists()) {
480 index_cache_directory.mkdir();
481 }
482
483 // Disable caching if the index cache directory isn't available
484 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
485 query_result_caching_enabled = false;
486 }
487 }
488
489 String query_string = null;
490
491 // Parse the command-line arguments
492 for (int i = 1; i < args.length; i++) {
493 if (args[i].equals("-sort")) {
494 i++;
495 queryer.setSortField(args[i]);
496 }
497 else if (args[i].equals("-filter")) {
498 i++;
499 queryer.setFilterString(args[i]);
500 }
501 else if (args[i].equals("-dco")) {
502 i++;
503 queryer.setDefaultConjunctionOperator(args[i]);
504 }
505 else if (args[i].equals("-fuzziness")) {
506 i++;
507 queryer.setFuzziness(args[i]);
508 }
509 else if (args[i].equals("-startresults")) {
510 i++;
511 if (args[i].matches("\\d+")) {
512 queryer.setStartResults(Integer.parseInt(args[i]));
513 }
514 }
515 else if (args[i].equals("-endresults")) {
516 i++;
517 if (args[i].matches("\\d+")) {
518 queryer.setEndResults(Integer.parseInt(args[i]));
519 }
520 }
521 else {
522 query_string = args[i];
523 }
524 }
525
526 if (!queryer.initialise()) {
527 return;
528 }
529
530 // The query string has been specified as a command-line argument
531 if (query_string != null) {
532 runQueryCaching(index_directory, queryer, query_string);
533 }
534
535 // Read queries from STDIN
536 else {
537 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
538 while (true) {
539 // Read the query from STDIN
540 query_string = in.readLine();
541 if (query_string == null || query_string.length() == -1) {
542 break;
543 }
544
545 runQueryCaching(index_directory, queryer, query_string);
546
547 }
548 }
549 queryer.cleanUp();
550 }
551 catch (IOException exception) {
552 exception.printStackTrace();
553 }
554 }
555
556 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
557 throws IOException
558 {
559 StringBuffer query_results_xml = new StringBuffer();
560
561 // Check if this query result has been cached from a previous search (if it's enabled)
562 File query_result_cache_file = null;
563 if (query_result_caching_enabled) {
564 // Generate the cache file name from the query options
565 String query_result_cache_file_name = query_string + "-";
566 String fuzziness = queryer.getFuzziness();
567 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
568 String filter_string = queryer.getFilterString();
569 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
570 String sort_string = queryer.getSortField();
571 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
572 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
573 query_result_cache_file_name += default_conjunction_operator + "-";
574 int start_results = queryer.getStartResults();
575 int end_results = queryer.getEndResults();
576 query_result_cache_file_name += start_results + "-" + end_results;
577 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
578
579 // If the query result cache file exists, just return its contents and we're done
580 File index_cache_directory = new File(index_directory, "cache");
581 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
582 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
583 FileInputStream fis = new FileInputStream(query_result_cache_file);
584 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
585 BufferedReader buffered_reader = new BufferedReader(isr);
586 String line = "";
587 while ((line = buffered_reader.readLine()) != null) {
588 query_results_xml.append(line + "\n");
589 }
590 String query_results_xml_string = query_results_xml.toString();
591 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
592
593 utf8out.print(query_results_xml_string);
594 utf8out.flush();
595
596 return;
597 }
598 }
599
600 // not cached
601 query_results_xml.append("<ResultSet cached=\"false\">\n");
602 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
603 Filter filter = queryer.getFilter();
604 if (filter != null) {
605 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
606 }
607
608 LuceneQueryResult query_result = queryer.runQuery(query_string);
609 if (query_result == null) {
610 System.err.println("Couldn't run the query");
611 return;
612 }
613
614 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
615 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
616 } else {
617 query_results_xml.append(query_result.getXMLString());
618 }
619 query_results_xml.append("</ResultSet>\n");
620
621 utf8out.print(query_results_xml);
622 utf8out.flush();
623
624 try {
625 /*
626 Writer output = null;
627 File file = new File("/tmp/lucenequery.txt");
628 output = new BufferedWriter(new FileWriter(file,"UTF-8"));
629 output.write(query_results_xml.toString());
630 output.close();
631 */
632
633 FileOutputStream fos = new FileOutputStream("/tmp/lucenequery.txt");
634
635 OutputStreamWriter osw2 = new OutputStreamWriter(fos, "UTF-8");
636
637 osw2.write("Query string = " + query_string + "\n");
638 osw2.write(query_results_xml.toString());
639 osw2.close();
640 }
641 catch (Exception e) {
642 e.printStackTrace();
643 }
644
645
646
647 // Cache this query result, if desired
648 if (query_result_caching_enabled) {
649 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
650 query_result_cache_file_writer.write(query_results_xml.toString());
651 query_result_cache_file_writer.close();
652 }
653 }
654
655 private static String fileSafe(String text)
656 {
657 StringBuffer file_safe_text = new StringBuffer();
658 for (int i = 0; i < text.length(); i++) {
659 char character = text.charAt(i);
660 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
661 file_safe_text.append(character);
662 }
663 else {
664 file_safe_text.append('%');
665 file_safe_text.append((int) character);
666 }
667 }
668 return file_safe_text.toString();
669 }
670
671
672}
673
674
Note: See TracBrowser for help on using the repository browser.