source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12987

Last change on this file since 12987 was 12987, checked in by mdewsnip, 18 years ago

You can now specify the query string as a command-line argument to GS2LuceneQuery.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Use the standard set of English stop words by default
43 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45 // Command-line options
46 static private String fuzziness = null;
47 static private Filter filter = null;
48 static private Sort sorter = new Sort();
49 static private String default_conjuction_operator = "OR";
50 static private int start_results = 1;
51 static private int end_results = Integer.MAX_VALUE;
52
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 String query_string = null;
70
71 // Parse the command-line arguments
72 for (int i = 1; i < args.length; i++) {
73 if (args[i].equals("-sort")) {
74 i++;
75 sorter = new Sort(args[i]);
76 }
77 else if (args[i].equals("-filter")) {
78 i++;
79 filter = parseFilterString(args[i]);
80 }
81 else if (args[i].equals("-dco")) {
82 i++;
83 default_conjuction_operator = args[i];
84 }
85 else if (args[i].equals("-fuzziness")) {
86 i++;
87 fuzziness = args[i];
88 }
89 else if (args[i].equals("-startresults")) {
90 i++;
91 if (args[i].matches("\\d+")) {
92 start_results = Integer.parseInt(args[i]);
93 }
94 }
95 else if (args[i].equals("-endresults")) {
96 i++;
97 if (args[i].matches("\\d+")) {
98 end_results = Integer.parseInt(args[i]);
99 }
100 }
101 else {
102 query_string = args[i];
103 }
104 }
105
106 // Lucene does "OR" queries by default; do an "AND" query if specified
107 if (default_conjuction_operator.equals("AND")) {
108 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110 }
111
112 // The query string has been specified as a command-line argument
113 if (query_string != null) {
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116
117 // Read queries from STDIN
118 else {
119 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120 while (true) {
121 // Read the query from STDIN
122 query_string = in.readLine();
123 if (query_string == null || query_string.length() == -1) {
124 break;
125 }
126
127 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128 }
129 }
130 }
131 catch (IOException exception) {
132 exception.printStackTrace();
133 }
134 }
135
136
137 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
138 throws IOException
139 {
140 try {
141 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
142 query_including_stop_words = query_including_stop_words.rewrite(reader);
143
144 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
145 query = query.rewrite(reader);
146
147 // Return the list of expanded query terms and their frequencies
148 HashSet terms = new HashSet();
149 query.extractTerms(terms);
150 Iterator term_iterator = terms.iterator();
151 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
152 while (term_iterator.hasNext()) {
153 Term term = (Term) term_iterator.next();
154
155 // Get the term frequency over all the documents
156 TermDocs term_docs = reader.termDocs(term);
157 int term_freq = term_docs.freq();
158 while (term_docs.next()) {
159 term_freq += term_docs.freq();
160 }
161
162 // If you wanted to limit this to just text terms add
163 // something like this:
164 // if (term.field().equals(TEXTFIELD))
165 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
166 }
167
168 // Return the list of stop words removed from the query
169 HashSet terms_including_stop_words = new HashSet();
170 query_including_stop_words.extractTerms(terms_including_stop_words);
171 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
172 while (terms_including_stop_words_iter.hasNext()) {
173 Term term = (Term) terms_including_stop_words_iter.next();
174 if (!terms.contains(term)) {
175 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
176 }
177 }
178
179 // Simple case for getting all the matching documents
180 if (end_results == Integer.MAX_VALUE) {
181 // Perform the query (filter and sorter may be null)
182 Hits hits = searcher.search(query, filter, sorter);
183 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
184
185 // Output the matching documents
186 System.out.println(" <StartResults num=\"" + start_results + "\" />");
187 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
188 for (int i = start_results; i <= hits.length(); i++) {
189 Document doc = hits.doc(i - 1);
190 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
191 }
192 }
193
194 // Slightly more complicated case for returning a subset of the matching documents
195 else {
196 // Perform the query (filter may be null)
197 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
198 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
199
200 // Output the matching documents
201 System.out.println(" <StartResults num=\"" + start_results + "\" />");
202 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
203 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
204 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
205 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
206 }
207 }
208 }
209 catch (ParseException parse_exception) {
210 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
211 }
212 catch (TooManyClauses too_many_clauses_exception) {
213 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
214 }
215
216 System.out.println("</ResultSet>");
217 }
218
219
220 private static String xmlSafe(String text) {
221 return text.replaceAll("\\&", "\\&amp;");
222 }
223
224 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
225 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
226 {
227 // Split query string into the search terms and the filter terms
228 // * The first +(...) term contains the search terms so count
229 // up '(' and stop when we finish matching ')'
230 int offset = 0;
231 int paren_count = 0;
232 boolean seen_paren = false;
233 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
234 if (query_string.charAt(offset) == '(') {
235 paren_count++;
236 seen_paren = true;
237 }
238 if (query_string.charAt(offset) == ')') {
239 paren_count--;
240 }
241 offset++;
242 }
243 String query_prefix = query_string.substring(0, offset);
244 String query_suffix = query_string.substring(offset);
245
246 ///ystem.err.println("Prefix: " + query_prefix);
247 ///ystem.err.println("Suffix: " + query_suffix);
248
249 Query query = query_parser.parse(query_prefix);
250 query = query.rewrite(reader);
251
252 // If this is a fuzzy search, then we need to add the fuzzy
253 // flag to each of the query terms
254 if (fuzziness != null && query.toString().length() > 0) {
255 // Revert the query to a string
256 System.err.println("Rewritten query: " + query.toString());
257 // Search through the string for TX:<term> query terms
258 // and append the ~ operator. Not that this search will
259 // not change phrase searches (TX:"<term> <term>") as
260 // fuzzy searching is not possible for these entries.
261 // Yahoo! Time for a state machine!
262 StringBuffer mutable_query_string = new StringBuffer(query.toString());
263 int o = 0; // Offset
264 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
265 int s = 0; // State
266 while (o < mutable_query_string.length()) {
267 char c = mutable_query_string.charAt(o);
268 if (s == 0 && c == TEXTFIELD.charAt(0)) {
269 ///ystem.err.println("Found T!");
270 s = 1;
271 }
272 else if (s == 1) {
273 if (c == TEXTFIELD.charAt(1)) {
274 ///ystem.err.println("Found X!");
275 s = 2;
276 }
277 else {
278 s = 0; // Reset
279 }
280 }
281 else if (s == 2) {
282 if (c == ':') {
283 ///ystem.err.println("Found TX:!");
284 s = 3;
285 }
286 else {
287 s = 0; // Reset
288 }
289 }
290 else if (s == 3) {
291 // Don't process phrases
292 if (c == '"') {
293 ///ystem.err.println("Stupid phrase...");
294 s = 0; // Reset
295 }
296 // Found the end of the term... add the
297 // fuzzy search indicator
298 // Nor outside the scope of parentheses
299 else if (Character.isWhitespace(c) || c == ')') {
300 ///ystem.err.println("Yahoo! Found fuzzy term.");
301 mutable_query_string.insert(o, '~' + fuzziness);
302 o++;
303 s = 0; // Reset
304 }
305 }
306 o++;
307 }
308 // If we were in the state of looking for the end of a
309 // term - then we just found it!
310 if (s == 3) {
311 mutable_query_string.append('~' + fuzziness);
312 }
313 // Reparse the query
314 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
315 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
316 }
317 else {
318 query = query_parser.parse(query_prefix + query_suffix);
319 }
320
321 return query;
322 }
323
324
325 /**
326 * @todo Michael to comment
327 */
328 private static Filter parseFilterString(String filter_string)
329 {
330 Filter result = null;
331 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
332 Matcher matcher = pattern.matcher(filter_string);
333 if (matcher.matches()) {
334 String field_name = matcher.group(1);
335 boolean include_lower = matcher.group(2).equals("[");
336 String lower_term = matcher.group(3);
337 String upper_term = matcher.group(4);
338 boolean include_upper = matcher.group(5).equals("]");
339 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
340 }
341 else {
342 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
343 }
344 return result;
345 }
346 /** parseFilterString() **/
347}
Note: See TracBrowser for help on using the repository browser.