Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 13557

Last change on this file since 13557 was 13557, checked in by kjdon, 17 years ago
in GS3 we will use GS2LuceneQuery directly, so moved most of the functionality out of the static methods and into instance methods. main() is now pretty much just a wrapper around the class. Caching has been left in the command line version for now - maybe should be in the class version too??
Property svn:keywords set to `Author Date Id Revision`
File size: 19.1 KB

Line
1	/**********************************************************************
2	*
3	* GS2LuceneQuery.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.nzdl.gsdl.LuceneWrap;
27
28
29	import java.io.*;
30	import java.util.*;
31	import java.util.regex.*;
32
33	import org.apache.lucene.analysis.Analyzer;
34	import org.apache.lucene.analysis.standard.StandardAnalyzer;
35	import org.apache.lucene.document.Document;
36	import org.apache.lucene.index.IndexReader;
37	import org.apache.lucene.index.Term;
38	import org.apache.lucene.index.TermDocs;
39	import org.apache.lucene.queryParser.ParseException;
40	import org.apache.lucene.queryParser.QueryParser;
41	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42	import org.apache.lucene.search.Filter;
43	import org.apache.lucene.search.Hit;
44	import org.apache.lucene.search.Hits;
45	import org.apache.lucene.search.IndexSearcher;
46	import org.apache.lucene.search.Query;
47	import org.apache.lucene.search.RangeFilter;
48	import org.apache.lucene.search.Searcher;
49	import org.apache.lucene.search.ScoreDoc;
50	import org.apache.lucene.search.Sort;
51	import org.apache.lucene.search.TopFieldDocs;
52
53
54	public class GS2LuceneQuery
55	{
56
57
58	static private String TEXTFIELD = "TX";
59
60	// Use the standard set of English stop words by default
61	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
62
63	private String full_indexdir="";
64	private String default_conjunction_operator = "OR";
65	private String fuzziness = null;
66	private String sort_field = null;
67	private Sort sorter=new Sort();
68	private String filter_string = null;
69	private Filter filter = null;
70	private int start_results=1;
71	private int end_results=Integer.MAX_VALUE;
72
73	private QueryParser query_parser = null;
74	private QueryParser query_parser_no_stop_words = null;
75	private Searcher searcher = null;
76	private IndexReader reader = null;
77
78	public GS2LuceneQuery() {
79
80	// Create one query parser with the standard set of stop words, and one with none
81
82	query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
83	query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
84	}
85
86
87	public boolean initialise() {
88
89	if (full_indexdir==null \|\| full_indexdir.length()==-1){
90	System.out.println("Index directory is not indicated ");
91	return false;
92	}
93	try {
94	searcher = new IndexSearcher(full_indexdir);
95	reader = ((IndexSearcher) searcher).getIndexReader();
96
97	}
98	catch (IOException exception) {
99	exception.printStackTrace();
100	return false;
101	}
102	return true;
103
104	}
105
106	public LuceneQueryResult runQuery(String query_string) {
107
108	if (query_string == null \|\| query_string.equals("")) {
109	System.out.println("The query word is not indicated ");
110	return null;
111	}
112
113	LuceneQueryResult lucene_query_result=new LuceneQueryResult();
114	lucene_query_result.clear();
115
116	try {
117	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
118	query_including_stop_words = query_including_stop_words.rewrite(reader);
119
120	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
121	query = query.rewrite(reader);
122
123	// Get the list of expanded query terms and their frequencies
124	// num docs matching, and total frequency
125	HashSet terms = new HashSet();
126	query.extractTerms(terms);
127
128	Iterator iter = terms.iterator();
129	while (iter.hasNext()) {
130
131	Term term = (Term) iter.next();
132
133	// Get the term frequency over all the documents
134	TermDocs term_docs = reader.termDocs(term);
135	int term_freq = term_docs.freq();
136	int match_docs = 0;
137	if (term_freq != 0) match_docs++;
138	while (term_docs.next()) {
139	term_freq += term_docs.freq();
140	if (term_docs.freq()!= 0) {
141	match_docs++;
142	}
143	}
144
145	// Create a term
146	lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
147	}
148
149	// Get the list of stop words removed from the query
150	HashSet terms_including_stop_words = new HashSet();
151	query_including_stop_words.extractTerms(terms_including_stop_words);
152	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
153	while (terms_including_stop_words_iter.hasNext()) {
154	Term term = (Term) terms_including_stop_words_iter.next();
155	if (!terms.contains(term)) {
156	lucene_query_result.addStopWord(term.text());
157	}
158	}
159
160	// do the query
161	// Simple case for getting all the matching documents
162	if (end_results == Integer.MAX_VALUE) {
163	// Perform the query (filter and sorter may be null)
164	Hits hits = searcher.search(query, filter, sorter);
165	lucene_query_result.setTotalDocs(hits.length());
166
167	// Output the matching documents
168	lucene_query_result.setStartResults(start_results);
169	lucene_query_result.setEndResults(hits.length());
170
171	for (int i = start_results; i <= hits.length(); i++) {
172	Document doc = hits.doc(i - 1);
173	lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.score(i-1));
174	}
175	}
176
177	// Slightly more complicated case for returning a subset of the matching documents
178	else {
179	// Perform the query (filter may be null)
180	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
181	lucene_query_result.setTotalDocs(hits.totalHits);
182
183	lucene_query_result.setStartResults(start_results);
184	lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
185
186	// Output the matching documents
187	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
188	Document doc = reader.document(hits.scoreDocs[i - 1].doc);
189	lucene_query_result.addDoc(Long.parseLong(doc.get("nodeID").trim()), hits.scoreDocs[i-1].score);
190	}
191	}
192	}
193
194	catch (ParseException parse_exception) {
195	lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
196	}
197	catch (TooManyClauses too_many_clauses_exception) {
198	lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
199	}
200	catch (IOException exception) {
201	lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
202	exception.printStackTrace();
203	}
204
205	return lucene_query_result;
206	}
207
208	public void setDefaultConjunctionOperator(String default_conjunction_operator) {
209	this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
210	if (default_conjunction_operator == "AND") {
211	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
212	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
213	} else { // default is OR
214	query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
215	query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
216	}
217	}
218
219	public String getDefaultConjunctionOperator() {
220	return this.default_conjunction_operator;
221	}
222
223	public void setEndResults(int end_results) {
224	this.end_results = end_results;
225	}
226	public int getEndResults() {
227	return this.end_results;
228	}
229
230	public void setFilterString(String filter_string) {
231	this.filter_string = filter_string;
232	this.filter = parseFilterString(filter_string);
233	}
234	public String getFilterString() {
235	return this.filter_string ;
236	}
237
238	public Filter getFilter() {
239	return this.filter;
240	}
241
242	public void setIndexDir(String full_indexdir) {
243	this.full_indexdir = full_indexdir;
244	}
245
246	public void setFuzziness(String fuzziness) {
247	this.fuzziness = fuzziness;
248	}
249	public String getFuzziness() {
250	return this.fuzziness;
251	}
252
253	public void setSortField(String sort_field) {
254	this.sort_field = sort_field;
255	this.sorter = new Sort(sort_field);
256	}
257	public String getSortField() {
258	return this.sort_field;
259	}
260
261	public void setStartResults(int start_results) {
262	this.start_results = start_results;
263	}
264	public int getStartResults() {
265	return this.start_results;
266	}
267
268	public void cleanUp() {
269	try {
270	searcher.close();
271	} catch (IOException exception) {
272	exception.printStackTrace();
273	}
274	}
275
276	private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
277	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
278	{
279	// Split query string into the search terms and the filter terms
280	// * The first +(...) term contains the search terms so count
281	// up '(' and stop when we finish matching ')'
282	int offset = 0;
283	int paren_count = 0;
284	boolean seen_paren = false;
285	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
286	if (query_string.charAt(offset) == '(') {
287	paren_count++;
288	seen_paren = true;
289	}
290	if (query_string.charAt(offset) == ')') {
291	paren_count--;
292	}
293	offset++;
294	}
295	String query_prefix = query_string.substring(0, offset);
296	String query_suffix = query_string.substring(offset);
297
298	///ystem.err.println("Prefix: " + query_prefix);
299	///ystem.err.println("Suffix: " + query_suffix);
300
301	Query query = query_parser.parse(query_prefix);
302	query = query.rewrite(reader);
303
304	// If this is a fuzzy search, then we need to add the fuzzy
305	// flag to each of the query terms
306	if (fuzziness != null && query.toString().length() > 0) {
307
308	// Revert the query to a string
309	System.err.println("Rewritten query: " + query.toString());
310	// Search through the string for TX:<term> query terms
311	// and append the ~ operator. Note that this search will
312	// not change phrase searches (TX:"<term> <term>") as
313	// fuzzy searching is not possible for these entries.
314	// Yahoo! Time for a state machine!
315	StringBuffer mutable_query_string = new StringBuffer(query.toString());
316	int o = 0; // Offset
317	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
318	int s = 0; // State
319	while(o < mutable_query_string.length()) {
320	char c = mutable_query_string.charAt(o);
321	if (s == 0 && c == TEXTFIELD.charAt(0)) {
322	///ystem.err.println("Found T!");
323	s = 1;
324	}
325	else if (s == 1) {
326	if (c == TEXTFIELD.charAt(1)) {
327	///ystem.err.println("Found X!");
328	s = 2;
329	}
330	else {
331	s = 0; // Reset
332	}
333	}
334	else if (s == 2) {
335	if (c == ':') {
336	///ystem.err.println("Found TX:!");
337	s = 3;
338	}
339	else {
340	s = 0; // Reset
341	}
342	}
343	else if (s == 3) {
344	// Don't process phrases
345	if (c == '"') {
346	///ystem.err.println("Stupid phrase...");
347	s = 0; // Reset
348	}
349	// Found the end of the term... add the
350	// fuzzy search indicator
351	// Nor outside the scope of parentheses
352	else if (Character.isWhitespace(c) \|\| c == ')') {
353	///ystem.err.println("Yahoo! Found fuzzy term.");
354	mutable_query_string.insert(o, '~' + fuzziness);
355	o++;
356	s = 0; // Reset
357	}
358	}
359	o++;
360	}
361	// If we were in the state of looking for the end of a
362	// term - then we just found it!
363	if (s == 3) {
364
365	mutable_query_string.append('~' + fuzziness);
366	}
367	// Reparse the query
368	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
369	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
370	}
371	else {
372	query = query_parser.parse(query_prefix + query_suffix);
373	}
374
375	return query;
376	}
377
378	private Filter parseFilterString(String filter_string)
379	{
380	Filter result = null;
381	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
382	Matcher matcher = pattern.matcher(filter_string);
383	if (matcher.matches()) {
384	String field_name = matcher.group(1);
385	boolean include_lower = matcher.group(2).equals("[");
386	String lower_term = matcher.group(3);
387	String upper_term = matcher.group(4);
388	boolean include_upper = matcher.group(5).equals("]");
389	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
390	}
391	else {
392	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
393	}
394	return result;
395	}
396
397
398	/** command line program and auxiliary methods */
399
400	// Fairly self-explanatory I should hope
401	static private boolean query_result_caching_enabled = false;
402
403	static public void main (String args[])
404	{
405	if (args.length == 0) {
406	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number] [query]");
407	return;
408	}
409
410	try {
411	String index_directory = args[0];
412
413	GS2LuceneQuery queryer = new GS2LuceneQuery();
414	queryer.setIndexDir(index_directory);
415
416	// Prepare the index cache directory, if query result caching is enabled
417	if (query_result_caching_enabled) {
418	// Make the index cache directory if it doesn't already exist
419	File index_cache_directory = new File(index_directory, "cache");
420	if (!index_cache_directory.exists()) {
421	index_cache_directory.mkdir();
422	}
423
424	// Disable caching if the index cache directory isn't available
425	if (!index_cache_directory.exists() \|\| !index_cache_directory.isDirectory()) {
426	query_result_caching_enabled = false;
427	}
428	}
429
430	String query_string = null;
431
432	// Parse the command-line arguments
433	for (int i = 1; i < args.length; i++) {
434	if (args[i].equals("-sort")) {
435	i++;
436	queryer.setSortField(args[i]);
437	}
438	else if (args[i].equals("-filter")) {
439	i++;
440	queryer.setFilterString(args[i]);
441	}
442	else if (args[i].equals("-dco")) {
443	i++;
444	queryer.setDefaultConjunctionOperator(args[i]);
445	}
446	else if (args[i].equals("-fuzziness")) {
447	i++;
448	queryer.setFuzziness(args[i]);
449	}
450	else if (args[i].equals("-startresults")) {
451	i++;
452	if (args[i].matches("\\d+")) {
453	queryer.setStartResults(Integer.parseInt(args[i]));
454	}
455	}
456	else if (args[i].equals("-endresults")) {
457	i++;
458	if (args[i].matches("\\d+")) {
459	queryer.setEndResults(Integer.parseInt(args[i]));
460	}
461	}
462	else {
463	query_string = args[i];
464	}
465	}
466
467	if (!queryer.initialise()) {
468	return;
469	}
470
471	// The query string has been specified as a command-line argument
472	if (query_string != null) {
473	runQueryCaching(index_directory, queryer, query_string);
474	}
475
476	// Read queries from STDIN
477	else {
478	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
479	while (true) {
480	// Read the query from STDIN
481	query_string = in.readLine();
482	if (query_string == null \|\| query_string.length() == -1) {
483	break;
484	}
485	runQueryCaching(index_directory, queryer, query_string);
486
487	}
488	}
489	queryer.cleanUp();
490	}
491	catch (IOException exception) {
492	exception.printStackTrace();
493	}
494	}
495
496	private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
497	throws IOException
498	{
499	StringBuffer query_results_xml = new StringBuffer();
500
501	// Check if this query result has been cached from a previous search (if it's enabled)
502	File query_result_cache_file = null;
503	if (query_result_caching_enabled) {
504	// Generate the cache file name from the query options
505	String query_result_cache_file_name = query_string + "-";
506	String fuzziness = queryer.getFuzziness();
507	query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
508	String filter_string = queryer.getFilterString();
509	query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
510	String sort_string = queryer.getSortField();
511	query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
512	String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
513	query_result_cache_file_name += default_conjunction_operator + "-";
514	int start_results = queryer.getStartResults();
515	int end_results = queryer.getEndResults();
516	query_result_cache_file_name += start_results + "-" + end_results;
517	query_result_cache_file_name = fileSafe(query_result_cache_file_name);
518
519	// If the query result cache file exists, just return its contents and we're done
520	File index_cache_directory = new File(index_directory, "cache");
521	query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
522	if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
523	FileInputStream fis = new FileInputStream(query_result_cache_file);
524	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
525	BufferedReader buffered_reader = new BufferedReader(isr);
526	String line = "";
527	while ((line = buffered_reader.readLine()) != null) {
528	query_results_xml.append(line + "\n");
529	}
530	String query_results_xml_string = query_results_xml.toString();
531	query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
532	System.out.print(query_results_xml_string);
533	return;
534	}
535	}
536
537	// not cached
538	query_results_xml.append("<ResultSet cached=\"false\">\n");
539	query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
540	Filter filter = queryer.getFilter();
541	if (filter != null) {
542	query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
543	}
544
545	LuceneQueryResult query_result = queryer.runQuery(query_string);
546	if (query_result == null) {
547	System.err.println("Couldn't run the query");
548	return;
549	}
550
551	if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
552	query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
553	} else {
554	query_results_xml.append(query_result.getXMLString());
555	}
556	query_results_xml.append("</ResultSet>\n");
557
558	System.out.print(query_results_xml);
559
560	// Cache this query result, if desired
561	if (query_result_caching_enabled) {
562	FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
563	query_result_cache_file_writer.write(query_results_xml.toString());
564	query_result_cache_file_writer.close();
565	}
566	}
567
568	private static String fileSafe(String text)
569	{
570	StringBuffer file_safe_text = new StringBuffer();
571	for (int i = 0; i < text.length(); i++) {
572	char character = text.charAt(i);
573	if ((character >= 'A' && character <= 'Z') \|\| (character >= 'a' && character <= 'z') \|\| (character >= '0' && character <= '9') \|\| character == '-') {
574	file_safe_text.append(character);
575	}
576	else {
577	file_safe_text.append('%');
578	file_safe_text.append((int) character);
579	}
580	}
581	return file_safe_text.toString();
582	}
583
584
585	}
586
587

Note: See TracBrowser for help on using the repository browser.

Download in other formats: