source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12408

Last change on this file since 12408 was 12408, checked in by mdewsnip, 18 years ago

Added a "-filter" option which can currently be used for specifying range filters (e.g. we're going to use it for dates). Many thanks to Me and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @version
6 */
7
8package org.nzdl.gsdl.LuceneWrap;
9
10
11import java.io.BufferedReader;
12import java.io.InputStreamReader;
13import java.util.Collections;
14import java.util.HashMap;
15import java.util.HashSet;
16import java.util.Iterator;
17import java.util.Set;
18
19import org.apache.lucene.analysis.Analyzer;
20import org.apache.lucene.analysis.standard.StandardAnalyzer;
21import org.apache.lucene.document.Document;
22import org.apache.lucene.index.IndexReader;
23import org.apache.lucene.index.Term;
24import org.apache.lucene.index.TermFreqVector;
25import org.apache.lucene.queryParser.QueryParser;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.Sort;
34
35
36public class GS2LuceneQuery
37{
38 public static void main (String args[])
39 {
40 if (args.length == 0) {
41 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
42 return;
43 }
44
45 try {
46 Searcher searcher = new IndexSearcher(args[0]);
47 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
48
49 Sort sorter = new Sort();
50 Filter filter = null;
51 boolean fuzzy = false;
52
53 // New code to allow the default conjunction operator to be
54 // definable
55 String default_conjuction_operator = "OR";
56 for (int i = 1; i < args.length; i++)
57 {
58 if (args[i].equals("-sort"))
59 {
60 i++;
61 ///ystem.err.println("**** sort by = " + args[i]);
62 sorter = new Sort(args[i]);
63 }
64 if (args[i].equals("-filter"))
65 {
66 i++;
67 filter = parseFilterString(args[i]);
68 }
69 if (args[i].equals("-dco"))
70 {
71 i++;
72 default_conjuction_operator = args[i];
73 }
74 if (args[i].equals("-fuzzy"))
75 {
76 fuzzy = true;
77 }
78 }
79
80 // Create one query parser with the standard set of stop words, and one with none
81 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
82 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
83
84 // Lucene does "OR" queries by default; do an "AND" query if specified
85 if (default_conjuction_operator.equals("AND"))
86 {
87 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
88 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
89 }
90
91 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
92 while (true)
93 {
94 // Read the query from STDIN
95 String query_string = in.readLine();
96 if (query_string == null || query_string.length() == -1)
97 {
98 break;
99 }
100 ///ystem.err.println("**** query = " + query_string);
101
102 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
103 query_including_stop_words = query_including_stop_words.rewrite(reader);
104
105 // Split query string into the search terms and the filter terms
106 // * The first +(...) term contains the search terms so count
107 // up '(' and stop when we finish matching ')'
108 int offset = 0;
109 int paren_count = 0;
110 boolean seen_paren = false;
111 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
112 {
113 if (query_string.charAt(offset) == '(')
114 {
115 paren_count++;
116 seen_paren = true;
117 }
118 if (query_string.charAt(offset) == ')')
119 {
120 paren_count--;
121 }
122 offset++;
123 }
124 String query_prefix = query_string.substring(0, offset);
125 String query_suffix = query_string.substring(offset);
126
127 ///ystem.err.println("Prefix: " + query_prefix);
128 ///ystem.err.println("Suffix: " + query_suffix);
129
130 Query query = query_parser.parse(query_prefix);
131 query = query.rewrite(reader);
132
133 // If this is a fuzzy search, then we need to add the fuzzy
134 // flag to each of the query terms
135 if (fuzzy && query.toString().length() > 0)
136 {
137 // Revert the query to a string
138 ///ystem.err.println("Rewritten query: " + query.toString());
139 // Search through the string for TX:<term> query terms
140 // and append the ~ operator. Not that this search will
141 // not change phrase searches (TX:"<term> <term>") as
142 // fuzzy searching is not possible for these entries.
143 // Yahoo! Time for a state machine!
144 StringBuffer mutable_query_string = new StringBuffer(query.toString());
145 int o = 0; // Offset
146 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
147 int s = 0; // State
148 while(o < mutable_query_string.length())
149 {
150 char c = mutable_query_string.charAt(o);
151 if (s == 0 && c == 'T')
152 {
153 ///ystem.err.println("Found T!");
154 s = 1;
155 }
156 else if (s == 1)
157 {
158 if (c == 'X')
159 {
160 ///ystem.err.println("Found X!");
161 s = 2;
162 }
163 else
164 {
165 s = 0; // Reset
166 }
167 }
168 else if (s == 2)
169 {
170 if (c == ':')
171 {
172 ///ystem.err.println("Found TX:!");
173 s = 3;
174 }
175 else
176 {
177 s = 0; // Reset
178 }
179 }
180 else if (s == 3)
181 {
182 // Don't process phrases
183 if (c == '"')
184 {
185 ///ystem.err.println("Stupid phrase...");
186 s = 0; // Reset
187 }
188 // Found the end of the term... add the
189 // fuzzy search indicator
190 // Nor outside the scope of parentheses
191 else if (Character.isWhitespace(c) || c == ')')
192 {
193 ///ystem.err.println("Yahoo! Found fuzzy term.");
194 mutable_query_string.insert(o, '~');
195 o++;
196 s = 0; // Reset
197 }
198 }
199 o++;
200 }
201 // If we were in the state of looking for the end of a
202 // term - then we just found it!
203 if (s == 3)
204 {
205 mutable_query_string.append('~');
206 }
207 // Reparse the query
208 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
209 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
210 // And rewrite again
211 query = query.rewrite(reader);
212 ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
213 }
214 else
215 {
216 query = query_parser.parse(query_prefix + query_suffix);
217 query = query.rewrite(reader);
218 }
219
220 // Perform the query
221 Hits hits;
222 if (filter != null) {
223 hits = searcher.search(query, filter, sorter);
224 }
225 else {
226 hits = searcher.search(query, sorter);
227 }
228 System.out.println("<ResultSet>");
229 System.out.println(" <QueryString>" + query_string + "</QueryString>");
230 // Return the list of expanded query terms and their frequencies
231 HashMap term_counts = new HashMap();
232 HashMap term_fields = new HashMap();
233 HashSet terms = new HashSet();
234 query.extractTerms(terms);
235 //System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
236 Iterator iter = terms.iterator();
237 while (iter.hasNext())
238 {
239 Term term = (Term) iter.next();
240 // If you wanted to limit this to just TX terms add
241 // something like this:
242 //if (term.field().equals("TX"))
243 term_counts.put(term.text(), new Integer(0));
244 term_fields.put(term.text(), term.field());
245 }
246
247 // Do we need to use a hit iterator to get sorted results?
248 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
249 Iterator hit_iter = hits.iterator();
250 while (hit_iter.hasNext())
251 {
252 Hit hit = (Hit) hit_iter.next();
253 Document doc = hit.getDocument();
254 String node_id = doc.get("nodeID");
255 System.out.println(" <Match id=\"" + node_id + "\" />");
256
257 // From the document, extract the Term Vector for the
258 // TX field
259 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
260 if (term_freq_vector != null && term_freq_vector.size() > 0)
261 {
262 int[] term_frequencies = term_freq_vector.getTermFrequencies();
263 // Now for each query term, determine the
264 // frequency - which may of course be 0.
265 Set term_counts_set = term_counts.keySet();
266 Iterator terms_iter = term_counts_set.iterator();
267 while (terms_iter.hasNext())
268 {
269 String term = (String) terms_iter.next();
270 Integer count_integer = (Integer) term_counts.get(term);
271 int count = count_integer.intValue();
272 int index = term_freq_vector.indexOf(term);
273 // If the term has a count, then add to
274 // the total count for this term
275 if (index != -1)
276 {
277 count += term_frequencies[index];
278
279 }
280 // Store the result
281 term_counts.put(term, new Integer(count));
282 count_integer = null;
283 term = null;
284 }
285 terms_iter = null;
286 term_counts_set = null;
287 }
288 else
289 {
290 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
291 }
292 }
293
294 // Retrieve all the useful terms
295 Set term_counts_set = term_counts.keySet();
296 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
297 // Iterate over them
298 Iterator terms_iter = term_counts_set.iterator();
299 while (terms_iter.hasNext())
300 {
301 String term = (String) terms_iter.next();
302 Integer count = (Integer) term_counts.get(term);
303 String field = (String) term_fields.get(term);
304 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
305 count = null;
306 term = null;
307 }
308 // Cleanup
309 terms_iter = null;
310 term_counts_set = null;
311
312 // Return the list of stop words removed from the query
313 HashSet terms_including_stop_words = new HashSet();
314 query_including_stop_words.extractTerms(terms_including_stop_words);
315 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
316 while (terms_including_stop_words_iter.hasNext()) {
317 Term term = (Term) terms_including_stop_words_iter.next();
318 if (!terms.contains(term)) {
319 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
320 }
321 }
322
323 System.out.println("</ResultSet>");
324 }
325
326 searcher.close();
327 }
328 catch (Exception exception) {
329 exception.printStackTrace();
330 }
331 }
332
333
334 private static Filter parseFilterString(String filter_string)
335 {
336 // Range filters
337 if (filter_string.matches("(.*):[\\{\\[](.+) TO (.+)[\\}\\]]")) {
338 String field_name = filter_string.substring(0, filter_string.indexOf(":"));
339 boolean include_lower = (filter_string.charAt(filter_string.indexOf(":") + 1) == '[');
340 String lower_term = filter_string.substring(filter_string.indexOf(":") + 2, filter_string.indexOf(" TO "));
341 String upper_term = filter_string.substring(filter_string.indexOf(" TO ") + " TO ".length(), filter_string.length() - 1);
342 boolean include_upper = (filter_string.charAt(filter_string.length() - 1) == ']');
343 return new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
344 }
345
346 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
347 return null;
348 }
349}
Note: See TracBrowser for help on using the repository browser.