source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12653

Last change on this file since 12653 was 12653, checked in by mdewsnip, 18 years ago

Made it a little bit easier to use a custom set of stop words with Lucene.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @version
6 */
7
8package org.nzdl.gsdl.LuceneWrap;
9
10
11import java.io.*;
12import java.util.*;
13
14import org.apache.lucene.analysis.Analyzer;
15import org.apache.lucene.analysis.standard.StandardAnalyzer;
16import org.apache.lucene.document.Document;
17import org.apache.lucene.index.IndexReader;
18import org.apache.lucene.index.Term;
19import org.apache.lucene.index.TermFreqVector;
20import org.apache.lucene.queryParser.ParseException;
21import org.apache.lucene.queryParser.QueryParser;
22import org.apache.lucene.search.BooleanQuery.TooManyClauses;
23import org.apache.lucene.search.Filter;
24import org.apache.lucene.search.Hit;
25import org.apache.lucene.search.Hits;
26import org.apache.lucene.search.IndexSearcher;
27import org.apache.lucene.search.Query;
28import org.apache.lucene.search.QueryFilter;
29import org.apache.lucene.search.RangeFilter;
30import org.apache.lucene.search.Searcher;
31import org.apache.lucene.search.Sort;
32
33
34public class GS2LuceneQuery
35{
36 // Use the standard set of English stop words by default
37 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
38
39
40 static public void main (String args[])
41 {
42 if (args.length == 0) {
43 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzzy] [-filter filter_string] [-sort sort_field] [-dco AND|OR]");
44 return;
45 }
46
47 try {
48 Searcher searcher = new IndexSearcher(args[0]);
49 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
50
51 // Create one query parser with stop words, and one with none
52 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
53 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
54
55 Sort sorter = new Sort();
56 QueryFilter filter = null;
57 boolean fuzzy = false;
58
59 // New code to allow the default conjunction operator to be
60 // definable
61 String default_conjuction_operator = "OR";
62 for (int i = 1; i < args.length; i++)
63 {
64 if (args[i].equals("-sort"))
65 {
66 i++;
67 sorter = new Sort(args[i]);
68 }
69 if (args[i].equals("-filter"))
70 {
71 i++;
72 try {
73 filter = new QueryFilter(query_parser.parse(args[i]));
74 }
75 catch (ParseException exception) {
76 exception.printStackTrace();
77 }
78 }
79 if (args[i].equals("-dco"))
80 {
81 i++;
82 default_conjuction_operator = args[i];
83 }
84 if (args[i].equals("-fuzzy"))
85 {
86 fuzzy = true;
87 }
88 }
89
90 // Lucene does "OR" queries by default; do an "AND" query if specified
91 if (default_conjuction_operator.equals("AND"))
92 {
93 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
94 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
95 }
96
97 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
98 while (true) {
99 // Read the query from STDIN
100 String query_string = in.readLine();
101 if (query_string == null || query_string.length() == -1) {
102 break;
103 }
104 System.out.println("<ResultSet>");
105 System.out.println(" <QueryString>" + query_string + "</QueryString>");
106
107 try {
108 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
109 query_including_stop_words = query_including_stop_words.rewrite(reader);
110
111 Query query = parseQuery(reader, query_parser, query_string, fuzzy);
112 query = query.rewrite(reader);
113
114 // Perform the query
115 Hits hits;
116 if (filter != null) {
117 hits = searcher.search(query, filter, sorter);
118 }
119 else {
120 hits = searcher.search(query, sorter);
121 }
122
123 // Return the list of expanded query terms and their frequencies
124 HashMap term_counts = new HashMap();
125 HashMap term_fields = new HashMap();
126 HashSet terms = new HashSet();
127 query.extractTerms(terms);
128 Iterator iter = terms.iterator();
129 while (iter.hasNext())
130 {
131 Term term = (Term) iter.next();
132 // If you wanted to limit this to just TX terms add
133 // something like this:
134 //if (term.field().equals("TX"))
135 term_counts.put(term.text(), new Integer(0));
136 term_fields.put(term.text(), term.field());
137 }
138
139 // Do we need to use a hit iterator to get sorted results?
140 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
141 Iterator hit_iter = hits.iterator();
142 while (hit_iter.hasNext())
143 {
144 Hit hit = (Hit) hit_iter.next();
145 Document doc = hit.getDocument();
146 String node_id = doc.get("nodeID");
147 System.out.println(" <Match id=\"" + node_id + "\" />");
148
149 // From the document, extract the Term Vector for the
150 // TX field
151 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
152 if (term_freq_vector != null && term_freq_vector.size() > 0)
153 {
154 int[] term_frequencies = term_freq_vector.getTermFrequencies();
155 // Now for each query term, determine the
156 // frequency - which may of course be 0.
157 Set term_counts_set = term_counts.keySet();
158 Iterator terms_iter = term_counts_set.iterator();
159 while (terms_iter.hasNext())
160 {
161 String term = (String) terms_iter.next();
162 Integer count_integer = (Integer) term_counts.get(term);
163 int count = count_integer.intValue();
164 int index = term_freq_vector.indexOf(term);
165 // If the term has a count, then add to
166 // the total count for this term
167 if (index != -1)
168 {
169 count += term_frequencies[index];
170
171 }
172 // Store the result
173 term_counts.put(term, new Integer(count));
174 count_integer = null;
175 term = null;
176 }
177 terms_iter = null;
178 term_counts_set = null;
179 }
180 else
181 {
182 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
183 }
184 }
185
186 // Retrieve all the useful terms
187 Set term_counts_set = term_counts.keySet();
188 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
189 // Iterate over them
190 Iterator terms_iter = term_counts_set.iterator();
191 while (terms_iter.hasNext())
192 {
193 String term = (String) terms_iter.next();
194 Integer count = (Integer) term_counts.get(term);
195 String field = (String) term_fields.get(term);
196 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
197 count = null;
198 term = null;
199 }
200 // Cleanup
201 terms_iter = null;
202 term_counts_set = null;
203
204 // Return the list of stop words removed from the query
205 HashSet terms_including_stop_words = new HashSet();
206 query_including_stop_words.extractTerms(terms_including_stop_words);
207 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
208 while (terms_including_stop_words_iter.hasNext()) {
209 Term term = (Term) terms_including_stop_words_iter.next();
210 if (!terms.contains(term)) {
211 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
212 }
213 }
214 }
215 catch (ParseException parse_exception) {
216 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
217 }
218 catch (TooManyClauses too_many_clauses_exception) {
219 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
220 }
221
222 System.out.println("</ResultSet>");
223 }
224
225 searcher.close();
226 }
227 catch (IOException exception) {
228 exception.printStackTrace();
229 }
230 }
231
232
233 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, boolean fuzzy)
234 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
235 {
236 // Split query string into the search terms and the filter terms
237 // * The first +(...) term contains the search terms so count
238 // up '(' and stop when we finish matching ')'
239 int offset = 0;
240 int paren_count = 0;
241 boolean seen_paren = false;
242 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
243 {
244 if (query_string.charAt(offset) == '(')
245 {
246 paren_count++;
247 seen_paren = true;
248 }
249 if (query_string.charAt(offset) == ')')
250 {
251 paren_count--;
252 }
253 offset++;
254 }
255 String query_prefix = query_string.substring(0, offset);
256 String query_suffix = query_string.substring(offset);
257
258 ///ystem.err.println("Prefix: " + query_prefix);
259 ///ystem.err.println("Suffix: " + query_suffix);
260
261 Query query = query_parser.parse(query_prefix);
262 query = query.rewrite(reader);
263
264 // If this is a fuzzy search, then we need to add the fuzzy
265 // flag to each of the query terms
266 if (fuzzy && query.toString().length() > 0)
267 {
268 // Revert the query to a string
269 System.err.println("Rewritten query: " + query.toString());
270 // Search through the string for TX:<term> query terms
271 // and append the ~ operator. Not that this search will
272 // not change phrase searches (TX:"<term> <term>") as
273 // fuzzy searching is not possible for these entries.
274 // Yahoo! Time for a state machine!
275 StringBuffer mutable_query_string = new StringBuffer(query.toString());
276 int o = 0; // Offset
277 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
278 int s = 0; // State
279 while(o < mutable_query_string.length())
280 {
281 char c = mutable_query_string.charAt(o);
282 if (s == 0 && c == 'T')
283 {
284 ///ystem.err.println("Found T!");
285 s = 1;
286 }
287 else if (s == 1)
288 {
289 if (c == 'X')
290 {
291 ///ystem.err.println("Found X!");
292 s = 2;
293 }
294 else
295 {
296 s = 0; // Reset
297 }
298 }
299 else if (s == 2)
300 {
301 if (c == ':')
302 {
303 ///ystem.err.println("Found TX:!");
304 s = 3;
305 }
306 else
307 {
308 s = 0; // Reset
309 }
310 }
311 else if (s == 3)
312 {
313 // Don't process phrases
314 if (c == '"')
315 {
316 ///ystem.err.println("Stupid phrase...");
317 s = 0; // Reset
318 }
319 // Found the end of the term... add the
320 // fuzzy search indicator
321 // Nor outside the scope of parentheses
322 else if (Character.isWhitespace(c) || c == ')')
323 {
324 ///ystem.err.println("Yahoo! Found fuzzy term.");
325 mutable_query_string.insert(o, '~');
326 o++;
327 s = 0; // Reset
328 }
329 }
330 o++;
331 }
332 // If we were in the state of looking for the end of a
333 // term - then we just found it!
334 if (s == 3)
335 {
336 mutable_query_string.append('~');
337 }
338 // Reparse the query
339 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
340 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
341 }
342 else
343 {
344 query = query_parser.parse(query_prefix + query_suffix);
345 }
346
347 return query;
348 }
349}
Note: See TracBrowser for help on using the repository browser.