Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 12770

Last change on this file since 12770 was 12770, checked in by mdewsnip, 18 years ago
Changed the Lucene "-fuzzy" argument to "-fuzziness <value>", for more accurate control.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermFreqVector;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.QueryFilter;
32	import org.apache.lucene.search.RangeFilter;
33	import org.apache.lucene.search.Searcher;
34	import org.apache.lucene.search.Sort;
35
36
37	public class GS2LuceneQuery
38	{
39	// Use the standard set of English stop words by default
40	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
41
42
43	static public void main (String args[])
44	{
45	if (args.length == 0) {
46	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number]");
47	return;
48	}
49
50	try {
51	Searcher searcher = new IndexSearcher(args[0]);
52	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
53
54	// Create one query parser with the standard set of stop words, and one with none
55	QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
56	QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
57
58	Sort sorter = new Sort();
59	Filter filter = null;
60	String fuzziness = null;
61
62	// Paging
63	int start_results = 1;
64	int end_results = -1;
65
66	// New code to allow the default conjunction operator to be
67	// definable
68	String default_conjuction_operator = "OR";
69	for (int i = 1; i < args.length; i++)
70	{
71	if (args[i].equals("-sort"))
72	{
73	i++;
74	sorter = new Sort(args[i]);
75	}
76	if (args[i].equals("-filter"))
77	{
78	i++;
79
80	// Parse up filter
81	filter = parseFilterString(args[i]);
82	}
83	if (args[i].equals("-dco"))
84	{
85	i++;
86	default_conjuction_operator = args[i];
87	}
88	if (args[i].equals("-fuzziness"))
89	{
90	i++;
91	fuzziness = args[i];
92	}
93	if (args[i].equals("-startresults"))
94	{
95	i++;
96	if (args[i].matches("\\d+"))
97	{
98	start_results = Integer.parseInt(args[i]);
99	}
100	}
101	if (args[i].equals("-endresults"))
102	{
103	i++;
104	if (args[i].matches("\\d+"))
105	{
106	end_results = Integer.parseInt(args[i]);
107	}
108	}
109	}
110
111	// Lucene does "OR" queries by default; do an "AND" query if specified
112	if (default_conjuction_operator.equals("AND"))
113	{
114	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
115	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
116	}
117
118	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
119	while (true) {
120	// Read the query from STDIN
121	String query_string = in.readLine();
122	if (query_string == null \|\| query_string.length() == -1) {
123	break;
124	}
125	System.out.println("<ResultSet>");
126	System.out.println(" <QueryString>" + query_string + "</QueryString>");
127	if (filter != null)
128	{
129	System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
130	}
131
132	try {
133	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134	query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
137	query = query.rewrite(reader);
138
139	// Perform the query
140	Hits hits;
141	if (filter != null) {
142	hits = searcher.search(query, filter, sorter);
143	}
144	else {
145	hits = searcher.search(query, sorter);
146	}
147
148	// Return the list of expanded query terms and their frequencies
149	HashMap term_counts = new HashMap();
150	HashMap term_fields = new HashMap();
151	HashSet terms = new HashSet();
152	query.extractTerms(terms);
153	Iterator iter = terms.iterator();
154	while (iter.hasNext())
155	{
156	Term term = (Term) iter.next();
157	// If you wanted to limit this to just TX terms add
158	// something like this:
159	//if (term.field().equals("TX"))
160	term_counts.put(term.text(), new Integer(0));
161	term_fields.put(term.text(), term.field());
162	}
163
164	// Do we need to use a hit iterator to get sorted results?
165	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
166	System.out.println(" <StartResults num=\"" + start_results + "\" />");
167	System.out.println(" <EndsResults num=\"" + end_results + "\" />");
168
169	int counter = 1;
170	Iterator hit_iter = hits.iterator();
171	while (hit_iter.hasNext())
172	{
173	Hit hit = (Hit) hit_iter.next();
174	Document doc = hit.getDocument();
175	String node_id = doc.get("nodeID");
176
177	// May not be paging results
178	if (start_results == 1 && end_results == -1)
179	{
180	System.out.println(" <Match id=\"" + node_id + "\" />");
181	}
182	// Otherwise skip up until page offset
183	else if (start_results <= counter && counter <= end_results)
184	{
185	System.out.println(" <Match id=\"" + node_id + "\" />");
186	}
187	// And skip all the rest
188
189	// From the document, extract the Term Vector for the
190	// TX field
191	TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
192	if (term_freq_vector != null && term_freq_vector.size() > 0)
193	{
194	int[] term_frequencies = term_freq_vector.getTermFrequencies();
195	// Now for each query term, determine the
196	// frequency - which may of course be 0.
197	Set term_counts_set = term_counts.keySet();
198	Iterator terms_iter = term_counts_set.iterator();
199	while (terms_iter.hasNext())
200	{
201	String term = (String) terms_iter.next();
202	Integer count_integer = (Integer) term_counts.get(term);
203	int count = count_integer.intValue();
204	int index = term_freq_vector.indexOf(term);
205	// If the term has a count, then add to
206	// the total count for this term
207	if (index != -1)
208	{
209	count += term_frequencies[index];
210
211	}
212	// Store the result
213	term_counts.put(term, new Integer(count));
214	count_integer = null;
215	term = null;
216	}
217	terms_iter = null;
218	term_counts_set = null;
219	}
220	else
221	{
222	///ystem.err.println("Error! Missing term vector for document " + hit.getId());
223	}
224	++counter;
225	}
226
227	// Retrieve all the useful terms
228	Set term_counts_set = term_counts.keySet();
229	System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
230	// Iterate over them
231	Iterator terms_iter = term_counts_set.iterator();
232	while (terms_iter.hasNext())
233	{
234	String term = (String) terms_iter.next();
235	Integer count = (Integer) term_counts.get(term);
236	String field = (String) term_fields.get(term);
237	System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
238	count = null;
239	term = null;
240	}
241	// Cleanup
242	terms_iter = null;
243	term_counts_set = null;
244
245	// Return the list of stop words removed from the query
246	HashSet terms_including_stop_words = new HashSet();
247	query_including_stop_words.extractTerms(terms_including_stop_words);
248	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
249	while (terms_including_stop_words_iter.hasNext()) {
250	Term term = (Term) terms_including_stop_words_iter.next();
251	if (!terms.contains(term)) {
252	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
253	}
254	}
255	}
256	catch (ParseException parse_exception) {
257	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
258	}
259	catch (TooManyClauses too_many_clauses_exception) {
260	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
261	}
262
263	System.out.println("</ResultSet>");
264	}
265
266	searcher.close();
267	}
268	catch (IOException exception) {
269	exception.printStackTrace();
270	}
271	}
272
273
274	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
275	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
276	{
277	// Split query string into the search terms and the filter terms
278	// * The first +(...) term contains the search terms so count
279	// up '(' and stop when we finish matching ')'
280	int offset = 0;
281	int paren_count = 0;
282	boolean seen_paren = false;
283	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0))
284	{
285	if (query_string.charAt(offset) == '(')
286	{
287	paren_count++;
288	seen_paren = true;
289	}
290	if (query_string.charAt(offset) == ')')
291	{
292	paren_count--;
293	}
294	offset++;
295	}
296	String query_prefix = query_string.substring(0, offset);
297	String query_suffix = query_string.substring(offset);
298
299	///ystem.err.println("Prefix: " + query_prefix);
300	///ystem.err.println("Suffix: " + query_suffix);
301
302	Query query = query_parser.parse(query_prefix);
303	query = query.rewrite(reader);
304
305	// If this is a fuzzy search, then we need to add the fuzzy
306	// flag to each of the query terms
307	if (fuzziness != null && query.toString().length() > 0)
308	{
309	// Revert the query to a string
310	System.err.println("Rewritten query: " + query.toString());
311	// Search through the string for TX:<term> query terms
312	// and append the ~ operator. Not that this search will
313	// not change phrase searches (TX:"<term> <term>") as
314	// fuzzy searching is not possible for these entries.
315	// Yahoo! Time for a state machine!
316	StringBuffer mutable_query_string = new StringBuffer(query.toString());
317	int o = 0; // Offset
318	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
319	int s = 0; // State
320	while(o < mutable_query_string.length())
321	{
322	char c = mutable_query_string.charAt(o);
323	if (s == 0 && c == 'T')
324	{
325	///ystem.err.println("Found T!");
326	s = 1;
327	}
328	else if (s == 1)
329	{
330	if (c == 'X')
331	{
332	///ystem.err.println("Found X!");
333	s = 2;
334	}
335	else
336	{
337	s = 0; // Reset
338	}
339	}
340	else if (s == 2)
341	{
342	if (c == ':')
343	{
344	///ystem.err.println("Found TX:!");
345	s = 3;
346	}
347	else
348	{
349	s = 0; // Reset
350	}
351	}
352	else if (s == 3)
353	{
354	// Don't process phrases
355	if (c == '"')
356	{
357	///ystem.err.println("Stupid phrase...");
358	s = 0; // Reset
359	}
360	// Found the end of the term... add the
361	// fuzzy search indicator
362	// Nor outside the scope of parentheses
363	else if (Character.isWhitespace(c) \|\| c == ')')
364	{
365	///ystem.err.println("Yahoo! Found fuzzy term.");
366	mutable_query_string.insert(o, '~' + fuzziness);
367	o++;
368	s = 0; // Reset
369	}
370	}
371	o++;
372	}
373	// If we were in the state of looking for the end of a
374	// term - then we just found it!
375	if (s == 3)
376	{
377	mutable_query_string.append('~' + fuzziness);
378	}
379	// Reparse the query
380	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
381	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
382	}
383	else
384	{
385	query = query_parser.parse(query_prefix + query_suffix);
386	}
387
388	return query;
389	}
390
391
392	/**
393	* @todo Michael to comment
394	*/
395	private static Filter parseFilterString(String filter_string)
396	{
397	Filter result = null;
398	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
399	Matcher matcher = pattern.matcher(filter_string);
400	if (matcher.matches())
401	{
402	String field_name = matcher.group(1);
403	boolean include_lower = matcher.group(2).equals("[");
404	String lower_term = matcher.group(3);
405	String upper_term = matcher.group(4);
406	boolean include_upper = matcher.group(5).equals("]");
407	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
408	}
409	else
410	{
411	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
412	}
413	return result;
414	}
415	/ parseFilterString() /
416	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: