Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12387

Last change on this file since 12387 was 12387, checked in by mdewsnip, 18 years ago
Fixes for fuzzy searching, many thanks to John Thompson and DL Consulting Ltd.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.1 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @version
6	*/
7
8	package org.nzdl.gsdl.LuceneWrap;
9
10
11	import java.io.BufferedReader;
12	import java.io.InputStreamReader;
13	import java.util.Collections;
14	import java.util.HashMap;
15	import java.util.HashSet;
16	import java.util.Iterator;
17	import java.util.Set;
18
19	import org.apache.lucene.analysis.Analyzer;
20	import org.apache.lucene.analysis.standard.StandardAnalyzer;
21	import org.apache.lucene.document.Document;
22	import org.apache.lucene.index.IndexReader;
23	import org.apache.lucene.index.Term;
24	import org.apache.lucene.index.TermFreqVector;
25	import org.apache.lucene.queryParser.QueryParser;
26	import org.apache.lucene.search.Hit;
27	import org.apache.lucene.search.Hits;
28	import org.apache.lucene.search.IndexSearcher;
29	import org.apache.lucene.search.Query;
30	import org.apache.lucene.search.Searcher;
31	import org.apache.lucene.search.Sort;
32
33
34	public class GS2LuceneQuery
35	{
36	public static void main (String args[])
37	{
38	if (args.length == 0) {
39	System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
40	return;
41	}
42
43	try {
44	Searcher searcher = new IndexSearcher(args[0]);
45	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
46
47	Sort sorter = new Sort();
48	boolean fuzzy = false;
49
50	// New code to allow the default conjunction operator to be
51	// definable
52	String default_conjuction_operator = "OR";
53	for (int i = 1; i < args.length; i++)
54	{
55	if (args[i].equals("-sort"))
56	{
57	i++;
58	///ystem.err.println("**** sort by = " + args[i]);
59	sorter = new Sort(args[i]);
60	}
61	if (args[i].equals("-dco"))
62	{
63	i++;
64	default_conjuction_operator = args[i];
65	}
66	if (args[i].equals("-fuzzy"))
67	{
68	fuzzy = true;
69	}
70	}
71
72	// Create one query parser with the standard set of stop words, and one with none
73	QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
74	QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
75
76	// Lucene does "OR" queries by default; do an "AND" query if specified
77	if (default_conjuction_operator.equals("AND")) {
78	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
79	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
80	}
81
82	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
83	while (true) {
84	// Read the query from STDIN
85	String query_string = in.readLine();
86	if (query_string == null \|\| query_string.length() == -1) {
87	break;
88	}
89	///ystem.err.println("**** query = " + query_string);
90
91	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
92	query_including_stop_words = query_including_stop_words.rewrite(reader);
93
94	// Split query string into the search terms and the filter terms
95	// * The first +(...) term contains the search terms so count
96	// up '(' and stop when we finish matching ')'
97	int offset = 0;
98	int paren_count = 0;
99	boolean seen_paren = false;
100	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0))
101	{
102	if (query_string.charAt(offset) == '(')
103	{
104	paren_count++;
105	seen_paren = true;
106	}
107	if (query_string.charAt(offset) == ')')
108	{
109	paren_count--;
110	}
111	offset++;
112	}
113	String query_prefix = query_string.substring(0, offset);
114	String query_suffix = query_string.substring(offset);
115
116	///ystem.err.println("Prefix: " + query_prefix);
117	///ystem.err.println("Suffix: " + query_suffix);
118
119	Query query = query_parser.parse(query_prefix);
120	query = query.rewrite(reader);
121
122	// If this is a fuzzy search, then we need to add the fuzzy
123	// flag to each of the query terms
124	if (fuzzy && query.toString().length() > 0)
125	{
126	// Revert the query to a string
127	///ystem.err.println("Rewritten query: " + query.toString());
128	// Search through the string for TX:<term> query terms
129	// and append the ~ operator. Not that this search will
130	// not change phrase searches (TX:"<term> <term>") as
131	// fuzzy searching is not possible for these entries.
132	// Yahoo! Time for a state machine!
133	StringBuffer mutable_query_string = new StringBuffer(query.toString());
134	int o = 0; // Offset
135	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
136	int s = 0; // State
137	while(o < mutable_query_string.length())
138	{
139	char c = mutable_query_string.charAt(o);
140	if (s == 0 && c == 'T')
141	{
142	///ystem.err.println("Found T!");
143	s = 1;
144	}
145	else if (s == 1)
146	{
147	if (c == 'X')
148	{
149	///ystem.err.println("Found X!");
150	s = 2;
151	}
152	else
153	{
154	s = 0; // Reset
155	}
156	}
157	else if (s == 2)
158	{
159	if (c == ':')
160	{
161	///ystem.err.println("Found TX:!");
162	s = 3;
163	}
164	else
165	{
166	s = 0; // Reset
167	}
168	}
169	else if (s == 3)
170	{
171	// Don't process phrases
172	if (c == '"')
173	{
174	///ystem.err.println("Stupid phrase...");
175	s = 0; // Reset
176	}
177	// Found the end of the term... add the
178	// fuzzy search indicator
179	// Nor outside the scope of parentheses
180	else if (Character.isWhitespace(c) \|\| c == ')')
181	{
182	///ystem.err.println("Yahoo! Found fuzzy term.");
183	mutable_query_string.insert(o, '~');
184	o++;
185	s = 0; // Reset
186	}
187	}
188	o++;
189	}
190	// If we were in the state of looking for the end of a
191	// term - then we just found it!
192	if (s == 3)
193	{
194	mutable_query_string.append('~');
195	}
196	// Reparse the query
197	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
198	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
199	// And rewrite again
200	query = query.rewrite(reader);
201	///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
202	}
203
204
205	// Perform the query
206	Hits hits = searcher.search(query, sorter);
207	System.out.println("<ResultSet>");
208	System.out.println(" <QueryString>" + query_string + "</QueryString>");
209	// Return the list of expanded query terms and their frequencies
210	HashMap term_counts = new HashMap();
211	HashMap term_fields = new HashMap();
212	HashSet terms = new HashSet();
213	query.extractTerms(terms);
214	//System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
215	Iterator iter = terms.iterator();
216	while (iter.hasNext())
217	{
218	Term term = (Term) iter.next();
219	// If you wanted to limit this to just TX terms add
220	// something like this:
221	//if (term.field().equals("TX"))
222	term_counts.put(term.text(), new Integer(0));
223	term_fields.put(term.text(), term.field());
224	}
225
226	// Do we need to use a hit iterator to get sorted results?
227	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
228	Iterator hit_iter = hits.iterator();
229	while (hit_iter.hasNext())
230	{
231	Hit hit = (Hit) hit_iter.next();
232	Document doc = hit.getDocument();
233	String node_id = doc.get("nodeID");
234	System.out.println(" <Match id=\"" + node_id + "\" />");
235
236	// From the document, extract the Term Vector for the
237	// TX field
238	TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
239	if (term_freq_vector.size() > 0)
240	{
241	int[] term_frequencies = term_freq_vector.getTermFrequencies();
242	// Now for each query term, determine the
243	// frequency - which may of course be 0.
244	Set term_counts_set = term_counts.keySet();
245	Iterator terms_iter = term_counts_set.iterator();
246	while (terms_iter.hasNext())
247	{
248	String term = (String) terms_iter.next();
249	Integer count_integer = (Integer) term_counts.get(term);
250	int count = count_integer.intValue();
251	int index = term_freq_vector.indexOf(term);
252	// If the term has a count, then add to
253	// the total count for this term
254	if (index != -1)
255	{
256	count += term_frequencies[index];
257
258	}
259	// Store the result
260	term_counts.put(term, new Integer(count));
261	count_integer = null;
262	term = null;
263	}
264	terms_iter = null;
265	term_counts_set = null;
266	}
267	else
268	{
269	///ystem.err.println("Error! Missing term vector for document " + hit.getId());
270	}
271	}
272
273	// Retrieve all the useful terms
274	Set term_counts_set = term_counts.keySet();
275	System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
276	// Iterate over them
277	Iterator terms_iter = term_counts_set.iterator();
278	while (terms_iter.hasNext())
279	{
280	String term = (String) terms_iter.next();
281	Integer count = (Integer) term_counts.get(term);
282	String field = (String) term_fields.get(term);
283	System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
284	count = null;
285	term = null;
286	}
287	// Cleanup
288	terms_iter = null;
289	term_counts_set = null;
290
291	// Return the list of stop words removed from the query
292	HashSet terms_including_stop_words = new HashSet();
293	query_including_stop_words.extractTerms(terms_including_stop_words);
294	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
295	while (terms_including_stop_words_iter.hasNext()) {
296	Term term = (Term) terms_including_stop_words_iter.next();
297	if (!terms.contains(term)) {
298	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
299	}
300	}
301
302	System.out.println("</ResultSet>");
303	}
304
305	searcher.close();
306	}
307	catch (Exception exception) {
308	exception.printStackTrace();
309	}
310	}
311	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: