source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@12775

Last change on this file since 12775 was 12775, checked in by mdewsnip, 18 years ago

Fixed bug where some terms have zero frequency (because they don't actually appear in the matching documents).

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
/**
 *
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @version
 */

package org.nzdl.gsdl.LuceneWrap;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryFilter;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;

public class GS2LuceneQuery
{
    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;


    static public void main(String[] args)
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
            return;
        }

        try {
            Searcher searcher = new IndexSearcher(args[0]);
            IndexReader reader = ((IndexSearcher) searcher).getIndexReader();

            // Create one query parser with the standard set of stop words, and one with none
            QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
            QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
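            // The no-stop-words parser is used after each search to work out which
            // stop words were removed from the query, by comparing its terms with
            // those of the main query (see the <StopWord> output below)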

            Sort sorter = new Sort();
            Filter filter = null;
            String fuzziness = null;

            // Paging
            int start_results = 1;
            int end_results = -1;

            // New code to allow the default conjunction operator to be
            // definable
            String default_conjunction_operator = "OR";
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    sorter = new Sort(args[i]);
                }
                if (args[i].equals("-filter")) {
                    i++;

                    // Parse the filter string
                    filter = parseFilterString(args[i]);
                }
                if (args[i].equals("-dco")) {
                    i++;
                    default_conjunction_operator = args[i];
                }
                if (args[i].equals("-fuzziness")) {
                    i++;
                    fuzziness = args[i];
                }
                if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        start_results = Integer.parseInt(args[i]);
                    }
                }
                if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        end_results = Integer.parseInt(args[i]);
                    }
                }
            }

            // Lucene does "OR" queries by default; do an "AND" query if specified
            if (default_conjunction_operator.equals("AND")) {
                query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
                query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
            }

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
            while (true) {
                // Read the query from STDIN; stop at end of input or on an empty line
                String query_string = in.readLine();
                if (query_string == null || query_string.length() == 0) {
                    break;
                }
                System.out.println("<ResultSet>");
                System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
                if (filter != null) {
                    System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
                }

                try {
                    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
                    query_including_stop_words = query_including_stop_words.rewrite(reader);

                    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
                    query = query.rewrite(reader);

                    // Perform the query
                    Hits hits;
                    if (filter != null) {
                        hits = searcher.search(query, filter, sorter);
                    }
                    else {
                        hits = searcher.search(query, sorter);
                    }

                    // Return the list of expanded query terms and their frequencies
                    HashMap term_counts = new HashMap();
                    HashMap term_fields = new HashMap();
                    HashSet terms = new HashSet();
                    query.extractTerms(terms);
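                    // Note that extractTerms() is called on the rewritten query, so this
                    // set contains the expanded terms (e.g. from fuzzy or wildcard
                    // expansion) rather than the raw terms the user typed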
                    Iterator iter = terms.iterator();
                    while (iter.hasNext()) {
                        Term term = (Term) iter.next();
                        // If you wanted to limit this to just TX terms, add
                        // something like this:
                        // if (term.field().equals("TX"))
                        term_counts.put(term.text(), new Integer(0));
                        term_fields.put(term.text(), term.field());
                    }

                    // Do we need to use a hit iterator to get sorted results?
                    System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
                    System.out.println(" <StartResults num=\"" + start_results + "\" />");
                    System.out.println(" <EndResults num=\"" + end_results + "\" />");

                    int counter = 1;
                    Iterator hit_iter = hits.iterator();
                    while (hit_iter.hasNext()) {
                        Hit hit = (Hit) hit_iter.next();
                        Document doc = hit.getDocument();
                        String node_id = doc.get("nodeID");

                        // May not be paging results
                        if (start_results == 1 && end_results == -1) {
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // Otherwise skip up until the page offset
                        else if (start_results <= counter && counter <= end_results) {
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // And skip all the rest

                        // From the document, extract the Term Vector for the
                        // TX field
                        TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
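                        // getTermFreqVector() returns null if term vectors were not
                        // stored for this field when the index was built, hence the
                        // null check below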
                        if (term_freq_vector != null && term_freq_vector.size() > 0) {
                            int[] term_frequencies = term_freq_vector.getTermFrequencies();
                            // Now for each query term, determine the
                            // frequency - which may of course be 0.
                            Set term_counts_set = term_counts.keySet();
                            Iterator terms_iter = term_counts_set.iterator();
                            while (terms_iter.hasNext()) {
                                String term = (String) terms_iter.next();
                                Integer count_integer = (Integer) term_counts.get(term);
                                int count = count_integer.intValue();
                                int index = term_freq_vector.indexOf(term);
                                // If the term has a count, then add to
                                // the total count for this term
                                if (index != -1) {
                                    count += term_frequencies[index];
                                }
                                // Store the result
                                term_counts.put(term, new Integer(count));
                                count_integer = null;
                                term = null;
                            }
                            terms_iter = null;
                            term_counts_set = null;
                        }
                        else {
                            // System.err.println("Error! Missing term vector for document " + hit.getId());
                        }
                        ++counter;
                    }

                    // Retrieve all the useful terms
                    Set term_counts_set = term_counts.keySet();
                    // Note: num includes any zero-frequency terms that are skipped below
                    System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
                    // Iterate over them
                    Iterator terms_iter = term_counts_set.iterator();
                    while (terms_iter.hasNext()) {
                        String term = (String) terms_iter.next();
                        Integer count = (Integer) term_counts.get(term);
                        String field = (String) term_fields.get(term);

                        // Ignore any terms with zero frequency, because they don't exist in the matching
                        // documents. It seems that this should never happen, but it's a consequence of
                        // how the terms are identified. The terms are found by rewriting the query (above).
                        // At that point the query hasn't been run, so each query term is expanded without
                        // knowing whether the expanded term will actually appear in any of the resulting
                        // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
                        // the search is for "otago AND auckland", no matching documents may include "otaio".
                        if (count.intValue() > 0) {
                            System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
                        }
                        count = null;
                        term = null;
                    }

                    // Cleanup
                    terms_iter = null;
                    term_counts_set = null;

                    // Return the list of stop words removed from the query
                    HashSet terms_including_stop_words = new HashSet();
                    query_including_stop_words.extractTerms(terms_including_stop_words);
                    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
                    while (terms_including_stop_words_iter.hasNext()) {
                        Term term = (Term) terms_including_stop_words_iter.next();
                        if (!terms.contains(term)) {
                            System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
                        }
                    }
                }
                catch (ParseException parse_exception) {
                    System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
                }
                catch (TooManyClauses too_many_clauses_exception) {
                    System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
                }

                System.out.println("</ResultSet>");
            }

            searcher.close();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    private static String xmlSafe(String text) {
        // Escape the characters that are unsafe in XML element content
        return text.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;");
    }

    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split the query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms, so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);
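        // For example (field names purely illustrative), the query string
        // "+(TX:whale) +CC:a" would split into the prefix "+(TX:whale)"
        // and the suffix " +CC:a"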

        // System.err.println("Prefix: " + query_prefix);
        // System.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {
            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
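            // For instance, with a fuzziness of "0.5" (the value here is purely
            // illustrative), the term "TX:whale" is rewritten below as "TX:whale~0.5"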
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == 'T') {
                    // System.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == 'X') {
                        // System.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        // System.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        // System.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term - either whitespace or the
                    // closing parenthesis of an enclosing group - so add the
                    // fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        // System.err.println("Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o += fuzziness.length() + 1; // Skip over the inserted text
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were still looking for the end of a term, then the end
            // of the string is it
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            // System.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }


    /**
     * Parses a filter string of the form "+FIELD:[LOWER TO UPPER]" into a
     * RangeFilter. A square bracket makes the corresponding bound inclusive;
     * a curly brace makes it exclusive. Both bounds must be numeric.
     */
    private static Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
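        // The pattern accepts, for example, "+YR:[1990 TO 2000]" (both bounds
        // inclusive) or "+YR:{1990 TO 2000}" (both bounds exclusive); the field
        // name "YR" here is purely illustrative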
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }
    /** parseFilterString() **/
}