source: main/trunk/greenstone2/common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 24731

Last change on this file since 24731 was 24731, checked in by sjm84, 13 years ago

The Lucene 3.x version of the code was accidentally committed; rolling back to the 2.x-compatible version.

  • Property svn:keywords set to Author Date Id Revision
File size: 21.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.RangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.TopFieldDocs;
52
53
54public class GS2LuceneQuery
55{
56
57
58 static private String TEXTFIELD = "TX";
59
60 // Use the standard set of English stop words by default
61 static private String[] stop_words = GS2Analyzer.STOP_WORDS;
62
63 private String full_indexdir="";
64 private String default_conjunction_operator = "OR";
65 private String fuzziness = null;
66 private String sort_field = null;
67 private Sort sorter=new Sort();
68 private String filter_string = null;
69 private Filter filter = null;
70 private int start_results=1;
71 private int end_results=Integer.MAX_VALUE;
72
73 private QueryParser query_parser = null;
74 private QueryParser query_parser_no_stop_words = null;
75 private Searcher searcher = null;
76 private IndexReader reader = null;
77
78 static private PrintWriter utf8out = null;
79
80 static
81 {
82 try {
83 OutputStreamWriter osw = new OutputStreamWriter(System.out, "UTF-8");
84 utf8out = new PrintWriter(osw, true);
85 }
86 catch (UnsupportedEncodingException e) {
87 System.out.println(e);
88 }
89 }
90
91
92 public GS2LuceneQuery() {
93
94 // Create one query parser with the standard set of stop words, and one with none
95
96 query_parser = new QueryParser(TEXTFIELD, new GS2Analyzer(stop_words));
97 query_parser_no_stop_words = new QueryParser(TEXTFIELD, new GS2Analyzer(new String[] { }));
98 }
99
100
101 public boolean initialise() {
102
103 if (full_indexdir==null || full_indexdir.length()==-1){
104 utf8out.println("Index directory is not indicated ");
105 utf8out.flush();
106 return false;
107 }
108 try {
109 searcher = new IndexSearcher(full_indexdir);
110 reader = ((IndexSearcher) searcher).getIndexReader();
111
112 }
113 catch (IOException exception) {
114 exception.printStackTrace();
115 return false;
116 }
117 return true;
118
119 }
120
121 public LuceneQueryResult runQuery(String query_string) {
122
123 if (query_string == null || query_string.equals("")) {
124 utf8out.println("The query word is not indicated ");
125 utf8out.flush();
126 return null;
127 }
128
129 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
130 lucene_query_result.clear();
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 // System.err.println("********* query_string " + query_string + "****");
137
138 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
139 query = query.rewrite(reader);
140
141 // Get the list of expanded query terms and their frequencies
142 // num docs matching, and total frequency
143 HashSet terms = new HashSet();
144 query.extractTerms(terms);
145
146 HashMap doc_term_freq_map = new HashMap();
147
148 Iterator iter = terms.iterator();
149 while (iter.hasNext()) {
150
151 Term term = (Term) iter.next();
152
153 // Get the term frequency over all the documents
154 TermDocs term_docs = reader.termDocs(term);
155 int term_freq = 0;
156 int match_docs = 0;
157 while (term_docs.next())
158 {
159 if (term_docs.freq() != 0)
160 {
161 term_freq += term_docs.freq();
162 match_docs++;
163
164 // Calculate the document-level term frequency as well
165 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
166 int doc_term_freq = 0;
167 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
168 {
169 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
170 }
171 doc_term_freq += term_docs.freq();
172
173 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
174 }
175 }
176
177 // Create a term
178 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
179 }
180
181 // Get the list of stop words removed from the query
182 HashSet terms_including_stop_words = new HashSet();
183 query_including_stop_words.extractTerms(terms_including_stop_words);
184 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
185 while (terms_including_stop_words_iter.hasNext()) {
186 Term term = (Term) terms_including_stop_words_iter.next();
187 if (!terms.contains(term)) {
188 lucene_query_result.addStopWord(term.text());
189 }
190 }
191
192 // do the query
193 // Simple case for getting all the matching documents
194 if (end_results == Integer.MAX_VALUE) {
195 // Perform the query (filter and sorter may be null)
196 Hits hits = searcher.search(query, filter, sorter);
197 lucene_query_result.setTotalDocs(hits.length());
198
199 // Output the matching documents
200 lucene_query_result.setStartResults(start_results);
201 lucene_query_result.setEndResults(hits.length());
202
203 for (int i = start_results; i <= hits.length(); i++) {
204 int lucene_doc_num = hits.id(i - 1);
205 Document doc = hits.doc(i - 1);
206 int doc_term_freq = 0;
207 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
208 if (doc_term_freq_object != null)
209 {
210 doc_term_freq = doc_term_freq_object.intValue();
211 }
212 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.score(i-1), doc_term_freq);
213 }
214 }
215
216 // Slightly more complicated case for returning a subset of the matching documents
217 else {
218 // Perform the query (filter may be null)
219 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
220 lucene_query_result.setTotalDocs(hits.totalHits);
221
222 lucene_query_result.setStartResults(start_results);
223 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
224
225 // Output the matching documents
226 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
227 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
228 Document doc = reader.document(lucene_doc_num);
229 int doc_term_freq = 0;
230 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
231 if (doc_term_freq_object != null)
232 {
233 doc_term_freq = doc_term_freq_object.intValue();
234 }
235 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
236 }
237 }
238 }
239
240 catch (ParseException parse_exception) {
241 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
242 }
243 catch (TooManyClauses too_many_clauses_exception) {
244 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
245 }
246 catch (IOException exception) {
247 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
248 exception.printStackTrace();
249 }
250 catch (Exception exception) {
251 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
252 exception.printStackTrace();
253 }
254 return lucene_query_result;
255 }
256
257 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
258 this.default_conjunction_operator = default_conjunction_operator.toUpperCase();
259 if (default_conjunction_operator.equals("AND")) {
260 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
261 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
262 } else { // default is OR
263 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
264 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
265 }
266 }
267
268 public String getDefaultConjunctionOperator() {
269 return this.default_conjunction_operator;
270 }
271
272 public void setEndResults(int end_results) {
273 this.end_results = end_results;
274 }
275 public int getEndResults() {
276 return this.end_results;
277 }
278
279 public void setFilterString(String filter_string) {
280 this.filter_string = filter_string;
281 this.filter = parseFilterString(filter_string);
282 }
283 public String getFilterString() {
284 return this.filter_string ;
285 }
286
287 public Filter getFilter() {
288 return this.filter;
289 }
290
291 public void setIndexDir(String full_indexdir) {
292 this.full_indexdir = full_indexdir;
293 }
294
295 public void setFuzziness(String fuzziness) {
296 this.fuzziness = fuzziness;
297 }
298 public String getFuzziness() {
299 return this.fuzziness;
300 }
301
302 public void setSortField(String sort_field) {
303 this.sort_field = sort_field;
304 if (sort_field == null) {
305 this.sorter = new Sort();
306 } else {
307 this.sorter = new Sort(sort_field);
308 }
309 }
310 public String getSortField() {
311 return this.sort_field;
312 }
313
314 public void setStartResults(int start_results) {
315 if (start_results < 1) {
316 start_results = 1;
317 }
318 this.start_results = start_results;
319 }
320 public int getStartResults() {
321 return this.start_results;
322 }
323
324 public void cleanUp() {
325 try {
326 if (searcher != null) {
327 searcher.close();
328 }
329 } catch (IOException exception) {
330 exception.printStackTrace();
331 }
332 }
333
334 private Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
335 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
336 {
337 // Split query string into the search terms and the filter terms
338 // * The first +(...) term contains the search terms so count
339 // up '(' and stop when we finish matching ')'
340 int offset = 0;
341 int paren_count = 0;
342 boolean seen_paren = false;
343 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
344 if (query_string.charAt(offset) == '(') {
345 paren_count++;
346 seen_paren = true;
347 }
348 if (query_string.charAt(offset) == ')') {
349 paren_count--;
350 }
351 offset++;
352 }
353 String query_prefix = query_string.substring(0, offset);
354 String query_suffix = query_string.substring(offset);
355
356 ///ystem.err.println("Prefix: " + query_prefix);
357 ///ystem.err.println("Suffix: " + query_suffix);
358
359 Query query = query_parser.parse(query_prefix);
360 query = query.rewrite(reader);
361
362 // If this is a fuzzy search, then we need to add the fuzzy
363 // flag to each of the query terms
364 if (fuzziness != null && query.toString().length() > 0) {
365
366 // Revert the query to a string
367 System.err.println("Rewritten query: " + query.toString());
368 // Search through the string for TX:<term> query terms
369 // and append the ~ operator. Note that this search will
370 // not change phrase searches (TX:"<term> <term>") as
371 // fuzzy searching is not possible for these entries.
372 // Yahoo! Time for a state machine!
373 StringBuffer mutable_query_string = new StringBuffer(query.toString());
374 int o = 0; // Offset
375 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
376 int s = 0; // State
377 while(o < mutable_query_string.length()) {
378 char c = mutable_query_string.charAt(o);
379 if (s == 0 && c == TEXTFIELD.charAt(0)) {
380 ///ystem.err.println("Found T!");
381 s = 1;
382 }
383 else if (s == 1) {
384 if (c == TEXTFIELD.charAt(1)) {
385 ///ystem.err.println("Found X!");
386 s = 2;
387 }
388 else {
389 s = 0; // Reset
390 }
391 }
392 else if (s == 2) {
393 if (c == ':') {
394 ///ystem.err.println("Found TX:!");
395 s = 3;
396 }
397 else {
398 s = 0; // Reset
399 }
400 }
401 else if (s == 3) {
402 // Don't process phrases
403 if (c == '"') {
404 ///ystem.err.println("Stupid phrase...");
405 s = 0; // Reset
406 }
407 // Found the end of the term... add the
408 // fuzzy search indicator
409 // Nor outside the scope of parentheses
410 else if (Character.isWhitespace(c) || c == ')') {
411 ///ystem.err.println("Yahoo! Found fuzzy term.");
412 mutable_query_string.insert(o, '~' + fuzziness);
413 o++;
414 s = 0; // Reset
415 }
416 }
417 o++;
418 }
419 // If we were in the state of looking for the end of a
420 // term - then we just found it!
421 if (s == 3) {
422
423 mutable_query_string.append('~' + fuzziness);
424 }
425 // Reparse the query
426 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
427 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
428 }
429 else {
430 query = query_parser.parse(query_prefix + query_suffix);
431 }
432
433 return query;
434 }
435
436 private Filter parseFilterString(String filter_string)
437 {
438 Filter result = null;
439 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
440 Matcher matcher = pattern.matcher(filter_string);
441 if (matcher.matches()) {
442 String field_name = matcher.group(1);
443 boolean include_lower = matcher.group(2).equals("[");
444 String lower_term = matcher.group(3);
445 String upper_term = matcher.group(4);
446 boolean include_upper = matcher.group(5).equals("]");
447 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
448 }
449 else {
450 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
451 }
452 return result;
453 }
454
455
456 protected void finalize() throws Throwable
457 {
458 try {
459 utf8out.flush();
460 } finally {
461 super.finalize();
462 }
463 }
464
465
466 /** command line program and auxiliary methods */
467
468 // Fairly self-explanatory I should hope
469 static private boolean query_result_caching_enabled = false;
470
471
472 static public void main (String args[])
473 {
474
475
476 if (args.length == 0) {
477 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
478 return;
479 }
480
481 try {
482 String index_directory = args[0];
483
484 GS2LuceneQuery queryer = new GS2LuceneQuery();
485 queryer.setIndexDir(index_directory);
486
487 // Prepare the index cache directory, if query result caching is enabled
488 if (query_result_caching_enabled) {
489 // Make the index cache directory if it doesn't already exist
490 File index_cache_directory = new File(index_directory, "cache");
491 if (!index_cache_directory.exists()) {
492 index_cache_directory.mkdir();
493 }
494
495 // Disable caching if the index cache directory isn't available
496 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
497 query_result_caching_enabled = false;
498 }
499 }
500
501 String query_string = null;
502
503 // Parse the command-line arguments
504 for (int i = 1; i < args.length; i++) {
505 if (args[i].equals("-sort")) {
506 i++;
507 queryer.setSortField(args[i]);
508 }
509 else if (args[i].equals("-filter")) {
510 i++;
511 queryer.setFilterString(args[i]);
512 }
513 else if (args[i].equals("-dco")) {
514 i++;
515 queryer.setDefaultConjunctionOperator(args[i]);
516 }
517 else if (args[i].equals("-fuzziness")) {
518 i++;
519 queryer.setFuzziness(args[i]);
520 }
521 else if (args[i].equals("-startresults")) {
522 i++;
523 if (args[i].matches("\\d+")) {
524 queryer.setStartResults(Integer.parseInt(args[i]));
525 }
526 }
527 else if (args[i].equals("-endresults")) {
528 i++;
529 if (args[i].matches("\\d+")) {
530 queryer.setEndResults(Integer.parseInt(args[i]));
531 }
532 }
533 else {
534 query_string = args[i];
535 }
536 }
537
538 if (!queryer.initialise()) {
539 return;
540 }
541
542 // The query string has been specified as a command-line argument
543 if (query_string != null) {
544 runQueryCaching(index_directory, queryer, query_string);
545 }
546
547 // Read queries from STDIN
548 else {
549 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
550 while (true) {
551 // Read the query from STDIN
552 query_string = in.readLine();
553 if (query_string == null || query_string.length() == -1) {
554 break;
555 }
556
557 runQueryCaching(index_directory, queryer, query_string);
558
559 }
560 }
561 queryer.cleanUp();
562 }
563 catch (IOException exception) {
564 exception.printStackTrace();
565 }
566 }
567
568 private static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
569 throws IOException
570 {
571 StringBuffer query_results_xml = new StringBuffer();
572
573 // Check if this query result has been cached from a previous search (if it's enabled)
574 File query_result_cache_file = null;
575 if (query_result_caching_enabled) {
576 // Generate the cache file name from the query options
577 String query_result_cache_file_name = query_string + "-";
578 String fuzziness = queryer.getFuzziness();
579 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
580 String filter_string = queryer.getFilterString();
581 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
582 String sort_string = queryer.getSortField();
583 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
584 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
585 query_result_cache_file_name += default_conjunction_operator + "-";
586 int start_results = queryer.getStartResults();
587 int end_results = queryer.getEndResults();
588 query_result_cache_file_name += start_results + "-" + end_results;
589 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
590
591 // If the query result cache file exists, just return its contents and we're done
592 File index_cache_directory = new File(index_directory, "cache");
593 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
594 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
595 FileInputStream fis = new FileInputStream(query_result_cache_file);
596 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
597 BufferedReader buffered_reader = new BufferedReader(isr);
598 String line = "";
599 while ((line = buffered_reader.readLine()) != null) {
600 query_results_xml.append(line + "\n");
601 }
602 String query_results_xml_string = query_results_xml.toString();
603 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
604
605 utf8out.print(query_results_xml_string);
606 utf8out.flush();
607
608 return;
609 }
610 }
611
612 // not cached
613 query_results_xml.append("<ResultSet cached=\"false\">\n");
614 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
615 Filter filter = queryer.getFilter();
616 if (filter != null) {
617 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
618 }
619
620 LuceneQueryResult query_result = queryer.runQuery(query_string);
621 if (query_result == null) {
622 System.err.println("Couldn't run the query");
623 return;
624 }
625
626 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
627 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
628 } else {
629 query_results_xml.append(query_result.getXMLString());
630 }
631 query_results_xml.append("</ResultSet>\n");
632
633 utf8out.print(query_results_xml);
634 utf8out.flush();
635
636 // Cache this query result, if desired
637 if (query_result_caching_enabled) {
638 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
639 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
640 // files, it will just affect the speed of subsequent requests.
641 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
642 // can get very long in some collections)
643 try
644 {
645 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
646 query_result_cache_file_writer.write(query_results_xml.toString());
647 query_result_cache_file_writer.close();
648 }
649 catch (Exception exception)
650 {
651 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
652 }
653 }
654 }
655
656 private static String fileSafe(String text)
657 {
658 StringBuffer file_safe_text = new StringBuffer();
659 for (int i = 0; i < text.length(); i++) {
660 char character = text.charAt(i);
661 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
662 file_safe_text.append(character);
663 }
664 else {
665 file_safe_text.append('%');
666 file_safe_text.append((int) character);
667 }
668 }
669 return file_safe_text.toString();
670 }
671
672
673}
674
675
Note: See TracBrowser for help on using the repository browser.