source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12989

Last change on this file since 12989 was 12989, checked in by mdewsnip, 18 years ago

Follow-up change to close the searcher object.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Use the standard set of English stop words by default
43 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45 // Command-line options
46 static private String fuzziness = null;
47 static private Filter filter = null;
48 static private Sort sorter = new Sort();
49 static private String default_conjuction_operator = "OR";
50 static private int start_results = 1;
51 static private int end_results = Integer.MAX_VALUE;
52
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 String query_string = null;
70
71 // Parse the command-line arguments
72 for (int i = 1; i < args.length; i++) {
73 if (args[i].equals("-sort")) {
74 i++;
75 sorter = new Sort(args[i]);
76 }
77 else if (args[i].equals("-filter")) {
78 i++;
79 filter = parseFilterString(args[i]);
80 }
81 else if (args[i].equals("-dco")) {
82 i++;
83 default_conjuction_operator = args[i];
84 }
85 else if (args[i].equals("-fuzziness")) {
86 i++;
87 fuzziness = args[i];
88 }
89 else if (args[i].equals("-startresults")) {
90 i++;
91 if (args[i].matches("\\d+")) {
92 start_results = Integer.parseInt(args[i]);
93 }
94 }
95 else if (args[i].equals("-endresults")) {
96 i++;
97 if (args[i].matches("\\d+")) {
98 end_results = Integer.parseInt(args[i]);
99 }
100 }
101 else {
102 query_string = args[i];
103 }
104 }
105
106 // Lucene does "OR" queries by default; do an "AND" query if specified
107 if (default_conjuction_operator.equals("AND")) {
108 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110 }
111
112 // The query string has been specified as a command-line argument
113 if (query_string != null) {
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116
117 // Read queries from STDIN
118 else {
119 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120 while (true) {
121 // Read the query from STDIN
122 query_string = in.readLine();
123 if (query_string == null || query_string.length() == -1) {
124 break;
125 }
126
127 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128 }
129 }
130
131 searcher.close();
132 }
133 catch (IOException exception) {
134 exception.printStackTrace();
135 }
136 }
137
138
139 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
140 throws IOException
141 {
142 try {
143 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
144 query_including_stop_words = query_including_stop_words.rewrite(reader);
145
146 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
147 query = query.rewrite(reader);
148
149 // Return the list of expanded query terms and their frequencies
150 HashSet terms = new HashSet();
151 query.extractTerms(terms);
152 Iterator term_iterator = terms.iterator();
153 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
154 while (term_iterator.hasNext()) {
155 Term term = (Term) term_iterator.next();
156
157 // Get the term frequency over all the documents
158 TermDocs term_docs = reader.termDocs(term);
159 int term_freq = term_docs.freq();
160 while (term_docs.next()) {
161 term_freq += term_docs.freq();
162 }
163
164 // If you wanted to limit this to just text terms add
165 // something like this:
166 // if (term.field().equals(TEXTFIELD))
167 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
168 }
169
170 // Return the list of stop words removed from the query
171 HashSet terms_including_stop_words = new HashSet();
172 query_including_stop_words.extractTerms(terms_including_stop_words);
173 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
174 while (terms_including_stop_words_iter.hasNext()) {
175 Term term = (Term) terms_including_stop_words_iter.next();
176 if (!terms.contains(term)) {
177 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
178 }
179 }
180
181 // Simple case for getting all the matching documents
182 if (end_results == Integer.MAX_VALUE) {
183 // Perform the query (filter and sorter may be null)
184 Hits hits = searcher.search(query, filter, sorter);
185 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
186
187 // Output the matching documents
188 System.out.println(" <StartResults num=\"" + start_results + "\" />");
189 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
190 for (int i = start_results; i <= hits.length(); i++) {
191 Document doc = hits.doc(i - 1);
192 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
193 }
194 }
195
196 // Slightly more complicated case for returning a subset of the matching documents
197 else {
198 // Perform the query (filter may be null)
199 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
200 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
201
202 // Output the matching documents
203 System.out.println(" <StartResults num=\"" + start_results + "\" />");
204 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
205 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
206 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
207 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
208 }
209 }
210 }
211 catch (ParseException parse_exception) {
212 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
213 }
214 catch (TooManyClauses too_many_clauses_exception) {
215 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
216 }
217
218 System.out.println("</ResultSet>");
219 }
220
221
222 private static String xmlSafe(String text) {
223 return text.replaceAll("\\&", "\\&amp;");
224 }
225
226 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
227 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
228 {
229 // Split query string into the search terms and the filter terms
230 // * The first +(...) term contains the search terms so count
231 // up '(' and stop when we finish matching ')'
232 int offset = 0;
233 int paren_count = 0;
234 boolean seen_paren = false;
235 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
236 if (query_string.charAt(offset) == '(') {
237 paren_count++;
238 seen_paren = true;
239 }
240 if (query_string.charAt(offset) == ')') {
241 paren_count--;
242 }
243 offset++;
244 }
245 String query_prefix = query_string.substring(0, offset);
246 String query_suffix = query_string.substring(offset);
247
248 ///ystem.err.println("Prefix: " + query_prefix);
249 ///ystem.err.println("Suffix: " + query_suffix);
250
251 Query query = query_parser.parse(query_prefix);
252 query = query.rewrite(reader);
253
254 // If this is a fuzzy search, then we need to add the fuzzy
255 // flag to each of the query terms
256 if (fuzziness != null && query.toString().length() > 0) {
257 // Revert the query to a string
258 System.err.println("Rewritten query: " + query.toString());
259 // Search through the string for TX:<term> query terms
260 // and append the ~ operator. Not that this search will
261 // not change phrase searches (TX:"<term> <term>") as
262 // fuzzy searching is not possible for these entries.
263 // Yahoo! Time for a state machine!
264 StringBuffer mutable_query_string = new StringBuffer(query.toString());
265 int o = 0; // Offset
266 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
267 int s = 0; // State
268 while (o < mutable_query_string.length()) {
269 char c = mutable_query_string.charAt(o);
270 if (s == 0 && c == TEXTFIELD.charAt(0)) {
271 ///ystem.err.println("Found T!");
272 s = 1;
273 }
274 else if (s == 1) {
275 if (c == TEXTFIELD.charAt(1)) {
276 ///ystem.err.println("Found X!");
277 s = 2;
278 }
279 else {
280 s = 0; // Reset
281 }
282 }
283 else if (s == 2) {
284 if (c == ':') {
285 ///ystem.err.println("Found TX:!");
286 s = 3;
287 }
288 else {
289 s = 0; // Reset
290 }
291 }
292 else if (s == 3) {
293 // Don't process phrases
294 if (c == '"') {
295 ///ystem.err.println("Stupid phrase...");
296 s = 0; // Reset
297 }
298 // Found the end of the term... add the
299 // fuzzy search indicator
300 // Nor outside the scope of parentheses
301 else if (Character.isWhitespace(c) || c == ')') {
302 ///ystem.err.println("Yahoo! Found fuzzy term.");
303 mutable_query_string.insert(o, '~' + fuzziness);
304 o++;
305 s = 0; // Reset
306 }
307 }
308 o++;
309 }
310 // If we were in the state of looking for the end of a
311 // term - then we just found it!
312 if (s == 3) {
313 mutable_query_string.append('~' + fuzziness);
314 }
315 // Reparse the query
316 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
317 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
318 }
319 else {
320 query = query_parser.parse(query_prefix + query_suffix);
321 }
322
323 return query;
324 }
325
326
327 /**
328 * @todo Michael to comment
329 */
330 private static Filter parseFilterString(String filter_string)
331 {
332 Filter result = null;
333 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
334 Matcher matcher = pattern.matcher(filter_string);
335 if (matcher.matches()) {
336 String field_name = matcher.group(1);
337 boolean include_lower = matcher.group(2).equals("[");
338 String lower_term = matcher.group(3);
339 String upper_term = matcher.group(4);
340 boolean include_upper = matcher.group(5).equals("]");
341 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
342 }
343 else {
344 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
345 }
346 return result;
347 }
348 /** parseFilterString() **/
349}
Note: See TracBrowser for help on using the repository browser.