source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java@ 24732

Last change on this file since 24732 was 24732, checked in by davidb, 13 years ago

Some additional changes that allow both Lucene 2.x and 3.x to be compiled up side-by-side

File size: 20.4 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper3;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.IndexSearcher;
44import org.apache.lucene.search.Query;
45import org.apache.lucene.search.TermRangeFilter;
46import org.apache.lucene.search.Searcher;
47import org.apache.lucene.search.ScoreDoc;
48import org.apache.lucene.search.Sort;
49import org.apache.lucene.search.SortField;
50import org.apache.lucene.search.TopFieldDocs;
51
52import org.apache.lucene.store.Directory;
53import org.apache.lucene.store.FSDirectory;
54import org.apache.lucene.util.Version;
55
56public class GS2LuceneQuery extends SharedSoleneQuery
57{
58 protected String full_indexdir="";
59
60 protected Sort sorter=new Sort();
61 protected Filter filter = null;
62
63 protected static Version matchVersion = Version.LUCENE_24;
64
65 protected QueryParser query_parser = null;
66 protected QueryParser query_parser_no_stop_words = null;
67 protected Searcher searcher = null;
68 protected IndexReader reader = null;
69
70 public GS2LuceneQuery() {
71 super();
72
73 // Create one query parser with the standard set of stop words, and one with none
74
75 query_parser = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
76 query_parser_no_stop_words = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer(new String[] { }));
77 }
78
79
80 public boolean initialise() {
81
82 if (!super.initialise()) {
83 return false;
84 }
85
86
87 if (full_indexdir==null || full_indexdir.length()==-1){
88 utf8out.println("Index directory is not indicated ");
89 utf8out.flush();
90 return false;
91 }
92
93 try {
94 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
95 searcher = new IndexSearcher(full_indexdir_dir,true);
96 reader = ((IndexSearcher) searcher).getIndexReader();
97
98 }
99 catch (IOException exception) {
100 exception.printStackTrace();
101 return false;
102 }
103 return true;
104
105 }
106
107 public void setIndexDir(String full_indexdir) {
108 this.full_indexdir = full_indexdir;
109 }
110
111 public void setSortField(String sort_field) {
112 super.setSortField(sort_field);
113
114 if (sort_field == null) {
115 this.sorter = new Sort();
116 } else {
117 this.sorter = new Sort(new SortField(sort_field,SortField.STRING)); // **** can do better than this?!?
118 }
119 }
120
121 public void setFilterString(String filter_string) {
122 super.setFilterString(filter_string);
123 this.filter = parseFilterString(filter_string);
124 }
125
126 public Filter getFilter() {
127 return this.filter;
128 }
129
130
131 public LuceneQueryResult runQuery(String query_string) {
132
133 if (query_string == null || query_string.equals("")) {
134 utf8out.println("The query word is not indicated ");
135 utf8out.flush();
136 return null;
137 }
138
139 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
140 lucene_query_result.clear();
141
142 try {
143 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
144 query_including_stop_words = query_including_stop_words.rewrite(reader);
145
146 // System.err.println("********* query_string " + query_string + "****");
147
148 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
149 query = query.rewrite(reader);
150
151 // Get the list of expanded query terms and their frequencies
152 // num docs matching, and total frequency
153 HashSet terms = new HashSet();
154 query.extractTerms(terms);
155
156 HashMap doc_term_freq_map = new HashMap();
157
158 Iterator iter = terms.iterator();
159 while (iter.hasNext()) {
160
161 Term term = (Term) iter.next();
162
163 // Get the term frequency over all the documents
164 TermDocs term_docs = reader.termDocs(term);
165 int term_freq = 0;
166 int match_docs = 0;
167 while (term_docs.next())
168 {
169 if (term_docs.freq() != 0)
170 {
171 term_freq += term_docs.freq();
172 match_docs++;
173
174 // Calculate the document-level term frequency as well
175 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
176 int doc_term_freq = 0;
177 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
178 {
179 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
180 }
181 doc_term_freq += term_docs.freq();
182
183 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
184 }
185 }
186
187 // Create a term
188 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
189 }
190
191 // Get the list of stop words removed from the query
192 HashSet terms_including_stop_words = new HashSet();
193 query_including_stop_words.extractTerms(terms_including_stop_words);
194 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
195 while (terms_including_stop_words_iter.hasNext()) {
196 Term term = (Term) terms_including_stop_words_iter.next();
197 if (!terms.contains(term)) {
198 lucene_query_result.addStopWord(term.text());
199 }
200 }
201
202 // do the query
203 // Simple case for getting all the matching documents
204 if (end_results == Integer.MAX_VALUE) {
205 // Perform the query (filter and sorter may be null)
206 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
207 lucene_query_result.setTotalDocs(hits.totalHits);
208
209 // Output the matching documents
210 lucene_query_result.setStartResults(start_results);
211 lucene_query_result.setEndResults(hits.totalHits);
212
213 for (int i = start_results; i <= hits.totalHits; i++) {
214 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
215 Document doc = reader.document(lucene_doc_num);
216 int doc_term_freq = 0;
217 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
218 if (doc_term_freq_object != null)
219 {
220 doc_term_freq = doc_term_freq_object.intValue();
221 }
222 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
223 }
224 }
225
226 // Slightly more complicated case for returning a subset of the matching documents
227 else {
228 // Perform the query (filter may be null)
229 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
230 lucene_query_result.setTotalDocs(hits.totalHits);
231
232 lucene_query_result.setStartResults(start_results);
233 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
234
235 // Output the matching documents
236 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
237 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
238 Document doc = reader.document(lucene_doc_num);
239 int doc_term_freq = 0;
240 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
241 if (doc_term_freq_object != null)
242 {
243 doc_term_freq = doc_term_freq_object.intValue();
244 }
245 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
246 }
247 }
248 }
249
250 catch (ParseException parse_exception) {
251 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
252 }
253 catch (TooManyClauses too_many_clauses_exception) {
254 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
255 }
256 catch (IOException exception) {
257 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
258 exception.printStackTrace();
259 }
260 catch (Exception exception) {
261 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
262 exception.printStackTrace();
263 }
264 return lucene_query_result;
265 }
266
267 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
268 super.setDefaultConjunctionOperator(default_conjunction_operator);
269
270 if (default_conjunction_operator.equals("AND")) {
271 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
272 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
273 } else { // default is OR
274 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
275 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
276 }
277 }
278
279
280 public void cleanUp() {
281 super.cleanUp();
282 try {
283 if (searcher != null) {
284 searcher.close();
285 }
286 } catch (IOException exception) {
287 exception.printStackTrace();
288 }
289 }
290
291
292 protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
293 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
294 {
295 // Split query string into the search terms and the filter terms
296 // * The first +(...) term contains the search terms so count
297 // up '(' and stop when we finish matching ')'
298 int offset = 0;
299 int paren_count = 0;
300 boolean seen_paren = false;
301 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
302 if (query_string.charAt(offset) == '(') {
303 paren_count++;
304 seen_paren = true;
305 }
306 if (query_string.charAt(offset) == ')') {
307 paren_count--;
308 }
309 offset++;
310 }
311 String query_prefix = query_string.substring(0, offset);
312 String query_suffix = query_string.substring(offset);
313
314 ///ystem.err.println("Prefix: " + query_prefix);
315 ///ystem.err.println("Suffix: " + query_suffix);
316
317 Query query = query_parser.parse(query_prefix);
318 query = query.rewrite(reader);
319
320 // If this is a fuzzy search, then we need to add the fuzzy
321 // flag to each of the query terms
322 if (fuzziness != null && query.toString().length() > 0) {
323
324 // Revert the query to a string
325 System.err.println("Rewritten query: " + query.toString());
326 // Search through the string for TX:<term> query terms
327 // and append the ~ operator. Note that this search will
328 // not change phrase searches (TX:"<term> <term>") as
329 // fuzzy searching is not possible for these entries.
330 // Yahoo! Time for a state machine!
331 StringBuffer mutable_query_string = new StringBuffer(query.toString());
332 int o = 0; // Offset
333 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
334 int s = 0; // State
335 while(o < mutable_query_string.length()) {
336 char c = mutable_query_string.charAt(o);
337 if (s == 0 && c == TEXTFIELD.charAt(0)) {
338 ///ystem.err.println("Found T!");
339 s = 1;
340 }
341 else if (s == 1) {
342 if (c == TEXTFIELD.charAt(1)) {
343 ///ystem.err.println("Found X!");
344 s = 2;
345 }
346 else {
347 s = 0; // Reset
348 }
349 }
350 else if (s == 2) {
351 if (c == ':') {
352 ///ystem.err.println("Found TX:!");
353 s = 3;
354 }
355 else {
356 s = 0; // Reset
357 }
358 }
359 else if (s == 3) {
360 // Don't process phrases
361 if (c == '"') {
362 ///ystem.err.println("Stupid phrase...");
363 s = 0; // Reset
364 }
365 // Found the end of the term... add the
366 // fuzzy search indicator
367 // Nor outside the scope of parentheses
368 else if (Character.isWhitespace(c) || c == ')') {
369 ///ystem.err.println("Yahoo! Found fuzzy term.");
370 mutable_query_string.insert(o, '~' + fuzziness);
371 o++;
372 s = 0; // Reset
373 }
374 }
375 o++;
376 }
377 // If we were in the state of looking for the end of a
378 // term - then we just found it!
379 if (s == 3) {
380
381 mutable_query_string.append('~' + fuzziness);
382 }
383 // Reparse the query
384 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
385 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
386 }
387 else {
388 query = query_parser.parse(query_prefix + query_suffix);
389 }
390
391 return query;
392 }
393
394 protected Filter parseFilterString(String filter_string)
395 {
396 Filter result = null;
397 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
398 Matcher matcher = pattern.matcher(filter_string);
399 if (matcher.matches()) {
400 String field_name = matcher.group(1);
401 boolean include_lower = matcher.group(2).equals("[");
402 String lower_term = matcher.group(3);
403 String upper_term = matcher.group(4);
404 boolean include_upper = matcher.group(5).equals("]");
405 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
406 }
407 else {
408 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
409 }
410 return result;
411 }
412
413
414 /** command line program and auxiliary methods */
415
416 // Fairly self-explanatory I should hope
417 static protected boolean query_result_caching_enabled = false;
418
419
420 static public void main (String args[])
421 {
422 if (args.length == 0) {
423 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
424 return;
425 }
426
427 try {
428 String index_directory = args[0];
429
430 GS2LuceneQuery queryer = new GS2LuceneQuery();
431 queryer.setIndexDir(index_directory);
432
433 // Prepare the index cache directory, if query result caching is enabled
434 if (query_result_caching_enabled) {
435 // Make the index cache directory if it doesn't already exist
436 File index_cache_directory = new File(index_directory, "cache");
437 if (!index_cache_directory.exists()) {
438 index_cache_directory.mkdir();
439 }
440
441 // Disable caching if the index cache directory isn't available
442 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
443 query_result_caching_enabled = false;
444 }
445 }
446
447 String query_string = null;
448
449 // Parse the command-line arguments
450 for (int i = 1; i < args.length; i++) {
451 if (args[i].equals("-sort")) {
452 i++;
453 queryer.setSortField(args[i]);
454 }
455 else if (args[i].equals("-filter")) {
456 i++;
457 queryer.setFilterString(args[i]);
458 }
459 else if (args[i].equals("-dco")) {
460 i++;
461 queryer.setDefaultConjunctionOperator(args[i]);
462 }
463 else if (args[i].equals("-fuzziness")) {
464 i++;
465 queryer.setFuzziness(args[i]);
466 }
467 else if (args[i].equals("-startresults")) {
468 i++;
469 if (args[i].matches("\\d+")) {
470 queryer.setStartResults(Integer.parseInt(args[i]));
471 }
472 }
473 else if (args[i].equals("-endresults")) {
474 i++;
475 if (args[i].matches("\\d+")) {
476 queryer.setEndResults(Integer.parseInt(args[i]));
477 }
478 }
479 else {
480 query_string = args[i];
481 }
482 }
483
484 if (!queryer.initialise()) {
485 return;
486 }
487
488 // The query string has been specified as a command-line argument
489 if (query_string != null) {
490 runQueryCaching(index_directory, queryer, query_string);
491 }
492
493 // Read queries from STDIN
494 else {
495 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
496 while (true) {
497 // Read the query from STDIN
498 query_string = in.readLine();
499 if (query_string == null || query_string.length() == -1) {
500 break;
501 }
502
503 runQueryCaching(index_directory, queryer, query_string);
504
505 }
506 }
507 queryer.cleanUp();
508 }
509 catch (IOException exception) {
510 exception.printStackTrace();
511 }
512 }
513
514 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
515 throws IOException
516 {
517 StringBuffer query_results_xml = new StringBuffer();
518
519 // Check if this query result has been cached from a previous search (if it's enabled)
520 File query_result_cache_file = null;
521 if (query_result_caching_enabled) {
522 // Generate the cache file name from the query options
523 String query_result_cache_file_name = query_string + "-";
524 String fuzziness = queryer.getFuzziness();
525 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
526 String filter_string = queryer.getFilterString();
527 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
528 String sort_string = queryer.getSortField();
529 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
530 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
531 query_result_cache_file_name += default_conjunction_operator + "-";
532 int start_results = queryer.getStartResults();
533 int end_results = queryer.getEndResults();
534 query_result_cache_file_name += start_results + "-" + end_results;
535 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
536
537 // If the query result cache file exists, just return its contents and we're done
538 File index_cache_directory = new File(index_directory, "cache");
539 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
540 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
541 FileInputStream fis = new FileInputStream(query_result_cache_file);
542 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
543 BufferedReader buffered_reader = new BufferedReader(isr);
544 String line = "";
545 while ((line = buffered_reader.readLine()) != null) {
546 query_results_xml.append(line + "\n");
547 }
548 String query_results_xml_string = query_results_xml.toString();
549 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
550
551 utf8out.print(query_results_xml_string);
552 utf8out.flush();
553
554 return;
555 }
556 }
557
558 // not cached
559 query_results_xml.append("<ResultSet cached=\"false\">\n");
560 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
561 Filter filter = queryer.getFilter();
562 if (filter != null) {
563 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
564 }
565
566 LuceneQueryResult query_result = queryer.runQuery(query_string);
567 if (query_result == null) {
568 System.err.println("Couldn't run the query");
569 return;
570 }
571
572 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
573 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
574 } else {
575 query_results_xml.append(query_result.getXMLString());
576 }
577 query_results_xml.append("</ResultSet>\n");
578
579 utf8out.print(query_results_xml);
580 utf8out.flush();
581
582 // Cache this query result, if desired
583 if (query_result_caching_enabled) {
584 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
585 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
586 // files, it will just affect the speed of subsequent requests.
587 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
588 // can get very long in some collections)
589 try
590 {
591 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
592 query_result_cache_file_writer.write(query_results_xml.toString());
593 query_result_cache_file_writer.close();
594 }
595 catch (Exception exception)
596 {
597 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
598 }
599 }
600 }
601
602 protected static String fileSafe(String text)
603 {
604 StringBuffer file_safe_text = new StringBuffer();
605 for (int i = 0; i < text.length(); i++) {
606 char character = text.charAt(i);
607 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
608 file_safe_text.append(character);
609 }
610 else {
611 file_safe_text.append('%');
612 file_safe_text.append((int) character);
613 }
614 }
615 return file_safe_text.toString();
616 }
617
618
619}
620
621
Note: See TracBrowser for help on using the repository browser.