source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 13054

Last change on this file since 13054 was 13054, checked in by mdewsnip, 18 years ago

Now puts the terms through xmlSafe() as well, to prevent invalid XML with weird terms containing punctuation.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Fairly self-explanatory I should hope
43 static private boolean query_result_caching_enabled = false;
44
45 // Use the standard set of English stop words by default
46 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
47
48 // Command-line options
49 static private String fuzziness = null;
50 static private String filter_string = null;
51 static private Filter filter = null;
52 static private String sort_string = null;
53 static private Sort sorter = new Sort();
54 static private String default_conjuction_operator = "OR";
55 static private int start_results = 1;
56 static private int end_results = Integer.MAX_VALUE;
57
58
59 static public void main (String args[])
60 {
61 if (args.length == 0) {
62 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
63 return;
64 }
65
66 try {
67 String index_directory = args[0];
68 Searcher searcher = new IndexSearcher(index_directory);
69 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
70
71 // Prepare the index cache directory, if query result caching is enabled
72 if (query_result_caching_enabled) {
73 // Make the index cache directory if it doesn't already exist
74 File index_cache_directory = new File(index_directory, "cache");
75 if (!index_cache_directory.exists()) {
76 index_cache_directory.mkdir();
77 }
78
79 // Disable caching if the index cache directory isn't available
80 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
81 query_result_caching_enabled = false;
82 }
83 }
84
85 // Create one query parser with the standard set of stop words, and one with none
86 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
87 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
88
89 String query_string = null;
90
91 // Parse the command-line arguments
92 for (int i = 1; i < args.length; i++) {
93 if (args[i].equals("-sort")) {
94 i++;
95 sort_string = args[i];
96 sorter = new Sort(sort_string);
97 }
98 else if (args[i].equals("-filter")) {
99 i++;
100 filter_string = args[i];
101 filter = parseFilterString(filter_string);
102 }
103 else if (args[i].equals("-dco")) {
104 i++;
105 default_conjuction_operator = args[i];
106 }
107 else if (args[i].equals("-fuzziness")) {
108 i++;
109 fuzziness = args[i];
110 }
111 else if (args[i].equals("-startresults")) {
112 i++;
113 if (args[i].matches("\\d+")) {
114 start_results = Integer.parseInt(args[i]);
115 }
116 }
117 else if (args[i].equals("-endresults")) {
118 i++;
119 if (args[i].matches("\\d+")) {
120 end_results = Integer.parseInt(args[i]);
121 }
122 }
123 else {
124 query_string = args[i];
125 }
126 }
127
128 // Lucene does "OR" queries by default; do an "AND" query if specified
129 if (default_conjuction_operator.equals("AND")) {
130 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
131 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
132 }
133
134 // The query string has been specified as a command-line argument
135 if (query_string != null) {
136 runQuery(index_directory, searcher, reader, query_parser, query_parser_no_stop_words, query_string);
137 }
138
139 // Read queries from STDIN
140 else {
141 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
142 while (true) {
143 // Read the query from STDIN
144 query_string = in.readLine();
145 if (query_string == null || query_string.length() == -1) {
146 break;
147 }
148
149 runQuery(index_directory, searcher, reader, query_parser, query_parser_no_stop_words, query_string);
150 }
151 }
152
153 searcher.close();
154 }
155 catch (IOException exception) {
156 exception.printStackTrace();
157 }
158 }
159
160
161 private static void runQuery(String index_directory, Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
162 throws IOException
163 {
164 StringBuffer query_results_xml = new StringBuffer();
165
166 // Check if this query result has been cached from a previous search (if it's enabled)
167 File query_result_cache_file = null;
168 if (query_result_caching_enabled) {
169 // Generate the cache file name from the query options
170 String query_result_cache_file_name = query_string + "-";
171 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
172 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
173 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
174 query_result_cache_file_name += default_conjuction_operator + "-";
175 query_result_cache_file_name += start_results + "-" + end_results;
176 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
177
178 // If the query result cache file exists, just return its contents and we're done
179 File index_cache_directory = new File(index_directory, "cache");
180 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
181 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
182 FileInputStream fis = new FileInputStream(query_result_cache_file);
183 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
184 BufferedReader buffered_reader = new BufferedReader(isr);
185 String line = "";
186 while ((line = buffered_reader.readLine()) != null) {
187 query_results_xml.append(line + "\n");
188 }
189 String query_results_xml_string = query_results_xml.toString();
190 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
191 System.out.print(query_results_xml_string);
192 return;
193 }
194 }
195
196 query_results_xml.append("<ResultSet cached=\"false\">\n");
197 query_results_xml.append(" <QueryString>" + xmlSafe(query_string) + "</QueryString>\n");
198 if (filter != null) {
199 query_results_xml.append(" <FilterString>" + filter.toString() + "</FilterString>\n");
200 }
201
202 try {
203 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
204 query_including_stop_words = query_including_stop_words.rewrite(reader);
205
206 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
207 query = query.rewrite(reader);
208
209 // Return the list of expanded query terms and their frequencies
210 HashSet terms = new HashSet();
211 query.extractTerms(terms);
212 Iterator term_iterator = terms.iterator();
213 query_results_xml.append(" <QueryTermsInfo num=\"" + terms.size() + "\"/>\n");
214 while (term_iterator.hasNext()) {
215 Term term = (Term) term_iterator.next();
216
217 // Get the term frequency over all the documents
218 TermDocs term_docs = reader.termDocs(term);
219 int term_freq = term_docs.freq();
220 while (term_docs.next()) {
221 term_freq += term_docs.freq();
222 }
223
224 // If you wanted to limit this to just text terms add
225 // something like this:
226 // if (term.field().equals(TEXTFIELD))
227 query_results_xml.append(" <Term value=\"" + xmlSafe(term.text()) + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />\n");
228 }
229
230 // Return the list of stop words removed from the query
231 HashSet terms_including_stop_words = new HashSet();
232 query_including_stop_words.extractTerms(terms_including_stop_words);
233 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
234 while (terms_including_stop_words_iter.hasNext()) {
235 Term term = (Term) terms_including_stop_words_iter.next();
236 if (!terms.contains(term)) {
237 query_results_xml.append(" <StopWord value=\"" + term.text() + "\"/>\n");
238 }
239 }
240
241 // Simple case for getting all the matching documents
242 if (end_results == Integer.MAX_VALUE) {
243 // Perform the query (filter and sorter may be null)
244 Hits hits = searcher.search(query, filter, sorter);
245 query_results_xml.append(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>\n");
246
247 // Output the matching documents
248 query_results_xml.append(" <StartResults num=\"" + start_results + "\" />\n");
249 query_results_xml.append(" <EndsResults num=\"" + hits.length() + "\" />\n");
250 for (int i = start_results; i <= hits.length(); i++) {
251 Document doc = hits.doc(i - 1);
252 query_results_xml.append(" <Match id=\"" + doc.get("nodeID") + "\" />\n");
253 }
254 }
255
256 // Slightly more complicated case for returning a subset of the matching documents
257 else {
258 // Perform the query (filter may be null)
259 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
260 query_results_xml.append(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>\n");
261
262 // Output the matching documents
263 query_results_xml.append(" <StartResults num=\"" + start_results + "\" />\n");
264 query_results_xml.append(" <EndsResults num=\"" + end_results + "\" />\n");
265 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
266 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
267 query_results_xml.append(" <Match id=\"" + doc.get("nodeID") + "\" />\n");
268 }
269 }
270 }
271 catch (ParseException parse_exception) {
272 query_results_xml.append(" <Error type=\"PARSE_EXCEPTION\"/>\n");
273 }
274 catch (TooManyClauses too_many_clauses_exception) {
275 query_results_xml.append(" <Error type=\"TOO_MANY_CLAUSES\"/>\n");
276 }
277
278 query_results_xml.append("</ResultSet>\n");
279
280 System.out.print(query_results_xml);
281
282 // Cache this query result, if desired
283 if (query_result_caching_enabled) {
284 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
285 query_result_cache_file_writer.write(query_results_xml.toString());
286 query_result_cache_file_writer.close();
287 }
288 }
289
290
291 private static String fileSafe(String text)
292 {
293 StringBuffer file_safe_text = new StringBuffer();
294 for (int i = 0; i < text.length(); i++) {
295 char character = text.charAt(i);
296 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
297 file_safe_text.append(character);
298 }
299 else {
300 file_safe_text.append('%');
301 file_safe_text.append((int) character);
302 }
303 }
304 return file_safe_text.toString();
305 }
306
307
308 private static String xmlSafe(String text) {
309 text = text.replaceAll("&","&amp;amp;");
310 text = text.replaceAll("<","&amp;lt;");
311 text = text.replaceAll(">","&amp;gt;");
312 text = text.replaceAll("'","&amp;#039;");
313 text = text.replaceAll("\\\"","&amp;quot;");
314 return text;
315 }
316
317
318 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
319 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
320 {
321 // Split query string into the search terms and the filter terms
322 // * The first +(...) term contains the search terms so count
323 // up '(' and stop when we finish matching ')'
324 int offset = 0;
325 int paren_count = 0;
326 boolean seen_paren = false;
327 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
328 if (query_string.charAt(offset) == '(') {
329 paren_count++;
330 seen_paren = true;
331 }
332 if (query_string.charAt(offset) == ')') {
333 paren_count--;
334 }
335 offset++;
336 }
337 String query_prefix = query_string.substring(0, offset);
338 String query_suffix = query_string.substring(offset);
339
340 ///ystem.err.println("Prefix: " + query_prefix);
341 ///ystem.err.println("Suffix: " + query_suffix);
342
343 Query query = query_parser.parse(query_prefix);
344 query = query.rewrite(reader);
345
346 // If this is a fuzzy search, then we need to add the fuzzy
347 // flag to each of the query terms
348 if (fuzziness != null && query.toString().length() > 0) {
349 // Revert the query to a string
350 System.err.println("Rewritten query: " + query.toString());
351 // Search through the string for TX:<term> query terms
352 // and append the ~ operator. Not that this search will
353 // not change phrase searches (TX:"<term> <term>") as
354 // fuzzy searching is not possible for these entries.
355 // Yahoo! Time for a state machine!
356 StringBuffer mutable_query_string = new StringBuffer(query.toString());
357 int o = 0; // Offset
358 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
359 int s = 0; // State
360 while (o < mutable_query_string.length()) {
361 char c = mutable_query_string.charAt(o);
362 if (s == 0 && c == TEXTFIELD.charAt(0)) {
363 ///ystem.err.println("Found T!");
364 s = 1;
365 }
366 else if (s == 1) {
367 if (c == TEXTFIELD.charAt(1)) {
368 ///ystem.err.println("Found X!");
369 s = 2;
370 }
371 else {
372 s = 0; // Reset
373 }
374 }
375 else if (s == 2) {
376 if (c == ':') {
377 ///ystem.err.println("Found TX:!");
378 s = 3;
379 }
380 else {
381 s = 0; // Reset
382 }
383 }
384 else if (s == 3) {
385 // Don't process phrases
386 if (c == '"') {
387 ///ystem.err.println("Stupid phrase...");
388 s = 0; // Reset
389 }
390 // Found the end of the term... add the
391 // fuzzy search indicator
392 // Nor outside the scope of parentheses
393 else if (Character.isWhitespace(c) || c == ')') {
394 ///ystem.err.println("Yahoo! Found fuzzy term.");
395 mutable_query_string.insert(o, '~' + fuzziness);
396 o++;
397 s = 0; // Reset
398 }
399 }
400 o++;
401 }
402 // If we were in the state of looking for the end of a
403 // term - then we just found it!
404 if (s == 3) {
405 mutable_query_string.append('~' + fuzziness);
406 }
407 // Reparse the query
408 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
409 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
410 }
411 else {
412 query = query_parser.parse(query_prefix + query_suffix);
413 }
414
415 return query;
416 }
417
418
419 /**
420 * @todo Michael to comment
421 */
422 private static Filter parseFilterString(String filter_string)
423 {
424 Filter result = null;
425 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
426 Matcher matcher = pattern.matcher(filter_string);
427 if (matcher.matches()) {
428 String field_name = matcher.group(1);
429 boolean include_lower = matcher.group(2).equals("[");
430 String lower_term = matcher.group(3);
431 String upper_term = matcher.group(4);
432 boolean include_upper = matcher.group(5).equals("]");
433 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
434 }
435 else {
436 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
437 }
438 return result;
439 }
440 /** parseFilterString() **/
441}
Note: See TracBrowser for help on using the repository browser.