source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12983

Last change on this file since 12983 was 12983, checked in by mdewsnip, 18 years ago

Moved the stuff for running the query into a new runQuery function, in preparation for allowing the query string to be specified as a command-line argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
public class GS2LuceneQuery
{
    // Name of the Lucene document field holding the full text ("TX");
    // this is the default field for both query parsers
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    // Command-line options: set once in main(), then read by runQuery()/parseQuery()

    // Fuzzy factor appended to each plain text term as "term~<fuzziness>";
    // null means exact (non-fuzzy) matching
    static private String fuzziness = null;
    // Optional range filter applied to every search; null means no filtering
    static private Filter filter = null;
    // Result ordering; the no-argument Sort() sorts by relevance
    static private Sort sorter = new Sort();
    // How bare query terms are combined: "OR" (Lucene's default) or "AND"
    // NOTE(review): field name is misspelled ("conjuction") but kept as-is
    // since every method in this file refers to it by this name
    static private String default_conjuction_operator = "OR";
    // 1-based window of matching documents to report on stdout
    static private int start_results = 1;
    static private int end_results = Integer.MAX_VALUE;
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 for (int i = 1; i < args.length; i++) {
70 if (args[i].equals("-sort")) {
71 i++;
72 sorter = new Sort(args[i]);
73 }
74 else if (args[i].equals("-filter")) {
75 i++;
76 filter = parseFilterString(args[i]);
77 }
78 else if (args[i].equals("-dco")) {
79 i++;
80 default_conjuction_operator = args[i];
81 }
82 else if (args[i].equals("-fuzziness")) {
83 i++;
84 fuzziness = args[i];
85 }
86 else if (args[i].equals("-startresults")) {
87 i++;
88 if (args[i].matches("\\d+")) {
89 start_results = Integer.parseInt(args[i]);
90 }
91 }
92 else if (args[i].equals("-endresults")) {
93 i++;
94 if (args[i].matches("\\d+")) {
95 end_results = Integer.parseInt(args[i]);
96 }
97 }
98 }
99
100 // Lucene does "OR" queries by default; do an "AND" query if specified
101 if (default_conjuction_operator.equals("AND")) {
102 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
103 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
104 }
105
106 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
107 while (true) {
108 // Read the query from STDIN
109 String query_string = in.readLine();
110 if (query_string == null || query_string.length() == -1) {
111 break;
112 }
113
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116 }
117 catch (IOException exception) {
118 exception.printStackTrace();
119 }
120 }
121
122
123 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
124 throws IOException
125 {
126 try {
127 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
128 query_including_stop_words = query_including_stop_words.rewrite(reader);
129
130 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
131 query = query.rewrite(reader);
132
133 // Return the list of expanded query terms and their frequencies
134 HashSet terms = new HashSet();
135 query.extractTerms(terms);
136 Iterator term_iterator = terms.iterator();
137 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
138 while (term_iterator.hasNext()) {
139 Term term = (Term) term_iterator.next();
140
141 // Get the term frequency over all the documents
142 TermDocs term_docs = reader.termDocs(term);
143 int term_freq = term_docs.freq();
144 while (term_docs.next()) {
145 term_freq += term_docs.freq();
146 }
147
148 // If you wanted to limit this to just text terms add
149 // something like this:
150 // if (term.field().equals(TEXTFIELD))
151 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
152 }
153
154 // Return the list of stop words removed from the query
155 HashSet terms_including_stop_words = new HashSet();
156 query_including_stop_words.extractTerms(terms_including_stop_words);
157 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
158 while (terms_including_stop_words_iter.hasNext()) {
159 Term term = (Term) terms_including_stop_words_iter.next();
160 if (!terms.contains(term)) {
161 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
162 }
163 }
164
165 // Simple case for getting all the matching documents
166 if (end_results == Integer.MAX_VALUE) {
167 // Perform the query (filter and sorter may be null)
168 Hits hits = searcher.search(query, filter, sorter);
169 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
170
171 // Output the matching documents
172 System.out.println(" <StartResults num=\"" + start_results + "\" />");
173 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
174 for (int i = start_results; i <= hits.length(); i++) {
175 Document doc = hits.doc(i - 1);
176 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
177 }
178 }
179
180 // Slightly more complicated case for returning a subset of the matching documents
181 else {
182 // Perform the query (filter may be null)
183 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
184 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
185
186 // Output the matching documents
187 System.out.println(" <StartResults num=\"" + start_results + "\" />");
188 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
189 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
190 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
191 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
192 }
193 }
194 }
195 catch (ParseException parse_exception) {
196 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
197 }
198 catch (TooManyClauses too_many_clauses_exception) {
199 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
200 }
201
202 System.out.println("</ResultSet>");
203 }
204
205
206 private static String xmlSafe(String text) {
207 return text.replaceAll("\\&", "\\&amp;");
208 }
209
    /**
     * Parses a raw query string into a Lucene Query, optionally rewriting
     * each plain text term into a fuzzy term ("term~&lt;fuzziness&gt;").
     *
     * The query string is split into a prefix (the first parenthesised
     * group, assumed to hold the search terms) and a suffix (everything
     * after it, assumed to hold filter terms). Only the prefix is subject
     * to fuzzy rewriting; the suffix is re-appended verbatim before the
     * final parse.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser (with or without stop words) to use
     * @param query_string the raw query, e.g. "+(TX:word) +FIELD:[a TO b]"
     * @param fuzziness    fuzzy factor to append to each term, or null for
     *                     exact matching
     * @return the parsed (and, if fuzzy, re-parsed) query
     * @throws java.io.IOException from Query.rewrite()
     * @throws org.apache.lucene.queryParser.ParseException on bad syntax
     */
    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	// If the string contains no parentheses at all, the whole string
	// becomes the prefix and the suffix is empty.
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the prefix so the fuzzy pass below sees the
	// expanded (normalised) form of each term
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {
	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    // (i.e. the machine matches the literal field prefix "TX:",
	    // built from TEXTFIELD, then scans for the end of the term)
	    int s = 0; // State
	    while (o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness is String concatenation (char + String),
			// e.g. "~0.7". Only o++ is needed afterwards: scanning
			// resumes inside the inserted text, but its digits
			// cannot match 'T' so the machine stays in BASE.
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {
		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
309
310
311 /**
312 * @todo Michael to comment
313 */
314 private static Filter parseFilterString(String filter_string)
315 {
316 Filter result = null;
317 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
318 Matcher matcher = pattern.matcher(filter_string);
319 if (matcher.matches()) {
320 String field_name = matcher.group(1);
321 boolean include_lower = matcher.group(2).equals("[");
322 String lower_term = matcher.group(3);
323 String upper_term = matcher.group(4);
324 boolean include_upper = matcher.group(5).equals("]");
325 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
326 }
327 else {
328 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
329 }
330 return result;
331 }
332 /** parseFilterString() **/
333}
Note: See TracBrowser for help on using the repository browser.