source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12776

Last change on this file since 12776 was 12776, checked in by mdewsnip, 18 years ago

Fixed a bug where misspelled words could be marked as stop words with fuzzy searching on.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermFreqVector;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.QueryFilter;
32import org.apache.lucene.search.RangeFilter;
33import org.apache.lucene.search.Searcher;
34import org.apache.lucene.search.Sort;
35
36
37public class GS2LuceneQuery
38{
39 // Use the standard set of English stop words by default
40 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
41
42
43 static public void main (String args[])
44 {
45 if (args.length == 0) {
46 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
47 return;
48 }
49
50 try {
51 Searcher searcher = new IndexSearcher(args[0]);
52 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
53
54 // Create one query parser with the standard set of stop words, and one with none
55 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
56 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
57
58 Sort sorter = new Sort();
59 Filter filter = null;
60 String fuzziness = null;
61
62 // Paging
63 int start_results = 1;
64 int end_results = -1;
65
66 // New code to allow the default conjunction operator to be
67 // definable
68 String default_conjuction_operator = "OR";
69 for (int i = 1; i < args.length; i++) {
70 if (args[i].equals("-sort")) {
71 i++;
72 sorter = new Sort(args[i]);
73 }
74 if (args[i].equals("-filter")) {
75 i++;
76
77 // Parse up filter
78 filter = parseFilterString(args[i]);
79 }
80 if (args[i].equals("-dco")) {
81 i++;
82 default_conjuction_operator = args[i];
83 }
84 if (args[i].equals("-fuzziness")) {
85 i++;
86 fuzziness = args[i];
87 }
88 if (args[i].equals("-startresults")) {
89 i++;
90 if (args[i].matches("\\d+")) {
91 start_results = Integer.parseInt(args[i]);
92 }
93 }
94 if (args[i].equals("-endresults")) {
95 i++;
96 if (args[i].matches("\\d+")) {
97 end_results = Integer.parseInt(args[i]);
98 }
99 }
100 }
101
102 // Lucene does "OR" queries by default; do an "AND" query if specified
103 if (default_conjuction_operator.equals("AND")) {
104 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
105 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
106 }
107
108 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
109 while (true) {
110 // Read the query from STDIN
111 String query_string = in.readLine();
112 if (query_string == null || query_string.length() == -1) {
113 break;
114 }
115 System.out.println("<ResultSet>");
116 System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
117 if (filter != null) {
118 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
119 }
120
121 try {
122 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
123 query_including_stop_words = query_including_stop_words.rewrite(reader);
124
125 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
126 query = query.rewrite(reader);
127
128 // Perform the query
129 Hits hits;
130 if (filter != null) {
131 hits = searcher.search(query, filter, sorter);
132 }
133 else {
134 hits = searcher.search(query, sorter);
135 }
136
137 // Return the list of expanded query terms and their frequencies
138 HashMap term_counts = new HashMap();
139 HashMap term_fields = new HashMap();
140 HashSet terms = new HashSet();
141 query.extractTerms(terms);
142 Iterator iter = terms.iterator();
143 while (iter.hasNext()) {
144 Term term = (Term) iter.next();
145 // If you wanted to limit this to just TX terms add
146 // something like this:
147 //if (term.field().equals("TX"))
148 term_counts.put(term.text(), new Integer(0));
149 term_fields.put(term.text(), term.field());
150 }
151
152 // Do we need to use a hit iterator to get sorted results?
153 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
154 System.out.println(" <StartResults num=\"" + start_results + "\" />");
155 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
156
157 int counter = 1;
158 Iterator hit_iter = hits.iterator();
159 while (hit_iter.hasNext()) {
160 Hit hit = (Hit) hit_iter.next();
161 Document doc = hit.getDocument();
162 String node_id = doc.get("nodeID");
163
164 // May not be paging results
165 if (start_results == 1 && end_results == -1) {
166 System.out.println(" <Match id=\"" + node_id + "\" />");
167 }
168 // Otherwise skip up until page offset
169 else if (start_results <= counter && counter <= end_results) {
170 System.out.println(" <Match id=\"" + node_id + "\" />");
171 }
172 // And skip all the rest
173
174 // From the document, extract the Term Vector for the
175 // TX field
176 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
177 if (term_freq_vector != null && term_freq_vector.size() > 0) {
178 int[] term_frequencies = term_freq_vector.getTermFrequencies();
179 // Now for each query term, determine the
180 // frequency - which may of course be 0.
181 Set term_counts_set = term_counts.keySet();
182 Iterator terms_iter = term_counts_set.iterator();
183 while (terms_iter.hasNext()) {
184
185 String term = (String) terms_iter.next();
186 Integer count_integer = (Integer) term_counts.get(term);
187 int count = count_integer.intValue();
188 int index = term_freq_vector.indexOf(term);
189 // If the term has a count, then add to
190 // the total count for this term
191 if (index != -1) {
192 count += term_frequencies[index];
193 }
194 // Store the result
195 term_counts.put(term, new Integer(count));
196 count_integer = null;
197 term = null;
198 }
199 terms_iter = null;
200 term_counts_set = null;
201 }
202 else {
203 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
204 }
205 ++counter;
206 }
207
208 // Retrieve all the useful terms
209 Set term_counts_set = term_counts.keySet();
210 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
211 // Iterate over them
212 Iterator terms_iter = term_counts_set.iterator();
213 while (terms_iter.hasNext()) {
214 String term = (String) terms_iter.next();
215 Integer count = (Integer) term_counts.get(term);
216 String field = (String) term_fields.get(term);
217
218 // Ignore any terms with zero frequency, because they don't exist in the matching
219 // documents. It seems that this should never happen, but it's a consequence of
220 // how the terms are identified. The terms are found by rewriting the query (above).
221 // At this point, the query hasn't been run, so each query term is expanded without
222 // knowing whether the expanded term will actually appear in one of the resulting
223 // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
224 // the search is for "otago AND auckland", no matching documents may include "otaio".
225 // Hopefully that made some sense...
226 if (count.intValue() > 0) {
227 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
228 }
229 count = null;
230 term = null;
231 }
232
233 // Cleanup
234 terms_iter = null;
235 term_counts_set = null;
236
237 // Return the list of stop words removed from the query
238 HashSet terms_including_stop_words = new HashSet();
239 query_including_stop_words.extractTerms(terms_including_stop_words);
240 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
241 while (terms_including_stop_words_iter.hasNext()) {
242 Term term = (Term) terms_including_stop_words_iter.next();
243 if (!terms.contains(term)) {
244 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
245 }
246 }
247 }
248 catch (ParseException parse_exception) {
249 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
250 }
251 catch (TooManyClauses too_many_clauses_exception) {
252 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
253 }
254
255 System.out.println("</ResultSet>");
256 }
257
258 searcher.close();
259 }
260 catch (IOException exception) {
261 exception.printStackTrace();
262 }
263 }
264
265 private static String xmlSafe(String text) {
266 return text.replaceAll("\\&", "\\&amp;");
267 }
268
269 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
270 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
271 {
272 // Split query string into the search terms and the filter terms
273 // * The first +(...) term contains the search terms so count
274 // up '(' and stop when we finish matching ')'
275 int offset = 0;
276 int paren_count = 0;
277 boolean seen_paren = false;
278 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
279 if (query_string.charAt(offset) == '(') {
280 paren_count++;
281 seen_paren = true;
282 }
283 if (query_string.charAt(offset) == ')') {
284 paren_count--;
285 }
286 offset++;
287 }
288 String query_prefix = query_string.substring(0, offset);
289 String query_suffix = query_string.substring(offset);
290
291 ///ystem.err.println("Prefix: " + query_prefix);
292 ///ystem.err.println("Suffix: " + query_suffix);
293
294 Query query = query_parser.parse(query_prefix);
295 query = query.rewrite(reader);
296
297 // If this is a fuzzy search, then we need to add the fuzzy
298 // flag to each of the query terms
299 if (fuzziness != null && query.toString().length() > 0) {
300 // Revert the query to a string
301 System.err.println("Rewritten query: " + query.toString());
302 // Search through the string for TX:<term> query terms
303 // and append the ~ operator. Not that this search will
304 // not change phrase searches (TX:"<term> <term>") as
305 // fuzzy searching is not possible for these entries.
306 // Yahoo! Time for a state machine!
307 StringBuffer mutable_query_string = new StringBuffer(query.toString());
308 int o = 0; // Offset
309 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
310 int s = 0; // State
311 while (o < mutable_query_string.length()) {
312 char c = mutable_query_string.charAt(o);
313 if (s == 0 && c == 'T') {
314 ///ystem.err.println("Found T!");
315 s = 1;
316 }
317 else if (s == 1) {
318 if (c == 'X') {
319 ///ystem.err.println("Found X!");
320 s = 2;
321 }
322 else {
323 s = 0; // Reset
324 }
325 }
326 else if (s == 2) {
327 if (c == ':') {
328 ///ystem.err.println("Found TX:!");
329 s = 3;
330 }
331 else {
332 s = 0; // Reset
333 }
334 }
335 else if (s == 3) {
336 // Don't process phrases
337 if (c == '"') {
338 ///ystem.err.println("Stupid phrase...");
339 s = 0; // Reset
340 }
341 // Found the end of the term... add the
342 // fuzzy search indicator
343 // Nor outside the scope of parentheses
344 else if (Character.isWhitespace(c) || c == ')') {
345 ///ystem.err.println("Yahoo! Found fuzzy term.");
346 mutable_query_string.insert(o, '~' + fuzziness);
347 o++;
348 s = 0; // Reset
349 }
350 }
351 o++;
352 }
353 // If we were in the state of looking for the end of a
354 // term - then we just found it!
355 if (s == 3) {
356 mutable_query_string.append('~' + fuzziness);
357 }
358 // Reparse the query
359 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
360 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
361 }
362 else {
363 query = query_parser.parse(query_prefix + query_suffix);
364 }
365
366 return query;
367 }
368
369
370 /**
371 * @todo Michael to comment
372 */
373 private static Filter parseFilterString(String filter_string)
374 {
375 Filter result = null;
376 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
377 Matcher matcher = pattern.matcher(filter_string);
378 if (matcher.matches()) {
379 String field_name = matcher.group(1);
380 boolean include_lower = matcher.group(2).equals("[");
381 String lower_term = matcher.group(3);
382 String upper_term = matcher.group(4);
383 boolean include_upper = matcher.group(5).equals("]");
384 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
385 }
386 else {
387 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
388 }
389 return result;
390 }
391 /** parseFilterString() **/
392}
Note: See TracBrowser for help on using the repository browser.