source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 12770

Last change on this file since 12770 was 12770, checked in by mdewsnip, 18 years ago

Changed the Lucene "-fuzzy" argument to "-fuzziness <value>", for more accurate control.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermFreqVector;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.QueryFilter;
32import org.apache.lucene.search.RangeFilter;
33import org.apache.lucene.search.Searcher;
34import org.apache.lucene.search.Sort;
35
36
37public class GS2LuceneQuery
38{
39 // Use the standard set of English stop words by default
40 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
41
42
43 static public void main (String args[])
44 {
45 if (args.length == 0) {
46 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
47 return;
48 }
49
50 try {
51 Searcher searcher = new IndexSearcher(args[0]);
52 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
53
54 // Create one query parser with the standard set of stop words, and one with none
55 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
56 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
57
58 Sort sorter = new Sort();
59 Filter filter = null;
60 String fuzziness = null;
61
62 // Paging
63 int start_results = 1;
64 int end_results = -1;
65
66 // New code to allow the default conjunction operator to be
67 // definable
68 String default_conjuction_operator = "OR";
69 for (int i = 1; i < args.length; i++)
70 {
71 if (args[i].equals("-sort"))
72 {
73 i++;
74 sorter = new Sort(args[i]);
75 }
76 if (args[i].equals("-filter"))
77 {
78 i++;
79
80 // Parse up filter
81 filter = parseFilterString(args[i]);
82 }
83 if (args[i].equals("-dco"))
84 {
85 i++;
86 default_conjuction_operator = args[i];
87 }
88 if (args[i].equals("-fuzziness"))
89 {
90 i++;
91 fuzziness = args[i];
92 }
93 if (args[i].equals("-startresults"))
94 {
95 i++;
96 if (args[i].matches("\\d+"))
97 {
98 start_results = Integer.parseInt(args[i]);
99 }
100 }
101 if (args[i].equals("-endresults"))
102 {
103 i++;
104 if (args[i].matches("\\d+"))
105 {
106 end_results = Integer.parseInt(args[i]);
107 }
108 }
109 }
110
111 // Lucene does "OR" queries by default; do an "AND" query if specified
112 if (default_conjuction_operator.equals("AND"))
113 {
114 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
115 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
116 }
117
118 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
119 while (true) {
120 // Read the query from STDIN
121 String query_string = in.readLine();
122 if (query_string == null || query_string.length() == -1) {
123 break;
124 }
125 System.out.println("<ResultSet>");
126 System.out.println(" <QueryString>" + query_string + "</QueryString>");
127 if (filter != null)
128 {
129 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
130 }
131
132 try {
133 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
134 query_including_stop_words = query_including_stop_words.rewrite(reader);
135
136 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
137 query = query.rewrite(reader);
138
139 // Perform the query
140 Hits hits;
141 if (filter != null) {
142 hits = searcher.search(query, filter, sorter);
143 }
144 else {
145 hits = searcher.search(query, sorter);
146 }
147
148 // Return the list of expanded query terms and their frequencies
149 HashMap term_counts = new HashMap();
150 HashMap term_fields = new HashMap();
151 HashSet terms = new HashSet();
152 query.extractTerms(terms);
153 Iterator iter = terms.iterator();
154 while (iter.hasNext())
155 {
156 Term term = (Term) iter.next();
157 // If you wanted to limit this to just TX terms add
158 // something like this:
159 //if (term.field().equals("TX"))
160 term_counts.put(term.text(), new Integer(0));
161 term_fields.put(term.text(), term.field());
162 }
163
164 // Do we need to use a hit iterator to get sorted results?
165 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
166 System.out.println(" <StartResults num=\"" + start_results + "\" />");
167 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
168
169 int counter = 1;
170 Iterator hit_iter = hits.iterator();
171 while (hit_iter.hasNext())
172 {
173 Hit hit = (Hit) hit_iter.next();
174 Document doc = hit.getDocument();
175 String node_id = doc.get("nodeID");
176
177 // May not be paging results
178 if (start_results == 1 && end_results == -1)
179 {
180 System.out.println(" <Match id=\"" + node_id + "\" />");
181 }
182 // Otherwise skip up until page offset
183 else if (start_results <= counter && counter <= end_results)
184 {
185 System.out.println(" <Match id=\"" + node_id + "\" />");
186 }
187 // And skip all the rest
188
189 // From the document, extract the Term Vector for the
190 // TX field
191 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
192 if (term_freq_vector != null && term_freq_vector.size() > 0)
193 {
194 int[] term_frequencies = term_freq_vector.getTermFrequencies();
195 // Now for each query term, determine the
196 // frequency - which may of course be 0.
197 Set term_counts_set = term_counts.keySet();
198 Iterator terms_iter = term_counts_set.iterator();
199 while (terms_iter.hasNext())
200 {
201 String term = (String) terms_iter.next();
202 Integer count_integer = (Integer) term_counts.get(term);
203 int count = count_integer.intValue();
204 int index = term_freq_vector.indexOf(term);
205 // If the term has a count, then add to
206 // the total count for this term
207 if (index != -1)
208 {
209 count += term_frequencies[index];
210
211 }
212 // Store the result
213 term_counts.put(term, new Integer(count));
214 count_integer = null;
215 term = null;
216 }
217 terms_iter = null;
218 term_counts_set = null;
219 }
220 else
221 {
222 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
223 }
224 ++counter;
225 }
226
227 // Retrieve all the useful terms
228 Set term_counts_set = term_counts.keySet();
229 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
230 // Iterate over them
231 Iterator terms_iter = term_counts_set.iterator();
232 while (terms_iter.hasNext())
233 {
234 String term = (String) terms_iter.next();
235 Integer count = (Integer) term_counts.get(term);
236 String field = (String) term_fields.get(term);
237 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
238 count = null;
239 term = null;
240 }
241 // Cleanup
242 terms_iter = null;
243 term_counts_set = null;
244
245 // Return the list of stop words removed from the query
246 HashSet terms_including_stop_words = new HashSet();
247 query_including_stop_words.extractTerms(terms_including_stop_words);
248 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
249 while (terms_including_stop_words_iter.hasNext()) {
250 Term term = (Term) terms_including_stop_words_iter.next();
251 if (!terms.contains(term)) {
252 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
253 }
254 }
255 }
256 catch (ParseException parse_exception) {
257 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
258 }
259 catch (TooManyClauses too_many_clauses_exception) {
260 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
261 }
262
263 System.out.println("</ResultSet>");
264 }
265
266 searcher.close();
267 }
268 catch (IOException exception) {
269 exception.printStackTrace();
270 }
271 }
272
273
274 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
275 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
276 {
277 // Split query string into the search terms and the filter terms
278 // * The first +(...) term contains the search terms so count
279 // up '(' and stop when we finish matching ')'
280 int offset = 0;
281 int paren_count = 0;
282 boolean seen_paren = false;
283 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
284 {
285 if (query_string.charAt(offset) == '(')
286 {
287 paren_count++;
288 seen_paren = true;
289 }
290 if (query_string.charAt(offset) == ')')
291 {
292 paren_count--;
293 }
294 offset++;
295 }
296 String query_prefix = query_string.substring(0, offset);
297 String query_suffix = query_string.substring(offset);
298
299 ///ystem.err.println("Prefix: " + query_prefix);
300 ///ystem.err.println("Suffix: " + query_suffix);
301
302 Query query = query_parser.parse(query_prefix);
303 query = query.rewrite(reader);
304
305 // If this is a fuzzy search, then we need to add the fuzzy
306 // flag to each of the query terms
307 if (fuzziness != null && query.toString().length() > 0)
308 {
309 // Revert the query to a string
310 System.err.println("Rewritten query: " + query.toString());
311 // Search through the string for TX:<term> query terms
312 // and append the ~ operator. Not that this search will
313 // not change phrase searches (TX:"<term> <term>") as
314 // fuzzy searching is not possible for these entries.
315 // Yahoo! Time for a state machine!
316 StringBuffer mutable_query_string = new StringBuffer(query.toString());
317 int o = 0; // Offset
318 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
319 int s = 0; // State
320 while(o < mutable_query_string.length())
321 {
322 char c = mutable_query_string.charAt(o);
323 if (s == 0 && c == 'T')
324 {
325 ///ystem.err.println("Found T!");
326 s = 1;
327 }
328 else if (s == 1)
329 {
330 if (c == 'X')
331 {
332 ///ystem.err.println("Found X!");
333 s = 2;
334 }
335 else
336 {
337 s = 0; // Reset
338 }
339 }
340 else if (s == 2)
341 {
342 if (c == ':')
343 {
344 ///ystem.err.println("Found TX:!");
345 s = 3;
346 }
347 else
348 {
349 s = 0; // Reset
350 }
351 }
352 else if (s == 3)
353 {
354 // Don't process phrases
355 if (c == '"')
356 {
357 ///ystem.err.println("Stupid phrase...");
358 s = 0; // Reset
359 }
360 // Found the end of the term... add the
361 // fuzzy search indicator
362 // Nor outside the scope of parentheses
363 else if (Character.isWhitespace(c) || c == ')')
364 {
365 ///ystem.err.println("Yahoo! Found fuzzy term.");
366 mutable_query_string.insert(o, '~' + fuzziness);
367 o++;
368 s = 0; // Reset
369 }
370 }
371 o++;
372 }
373 // If we were in the state of looking for the end of a
374 // term - then we just found it!
375 if (s == 3)
376 {
377 mutable_query_string.append('~' + fuzziness);
378 }
379 // Reparse the query
380 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
381 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
382 }
383 else
384 {
385 query = query_parser.parse(query_prefix + query_suffix);
386 }
387
388 return query;
389 }
390
391
392 /**
393 * @todo Michael to comment
394 */
395 private static Filter parseFilterString(String filter_string)
396 {
397 Filter result = null;
398 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
399 Matcher matcher = pattern.matcher(filter_string);
400 if (matcher.matches())
401 {
402 String field_name = matcher.group(1);
403 boolean include_lower = matcher.group(2).equals("[");
404 String lower_term = matcher.group(3);
405 String upper_term = matcher.group(4);
406 boolean include_upper = matcher.group(5).equals("]");
407 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
408 }
409 else
410 {
411 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
412 }
413 return result;
414 }
415 /** parseFilterString() **/
416}
Note: See TracBrowser for help on using the repository browser.