source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12980

Last change on this file since 12980 was 12980, checked in by mdewsnip, 18 years ago

Now passes the endresults value (if defined) into the Searcher.search() call so only the required number of documents are returned.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.2 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Use the standard set of English stop words by default
43 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45
46 static public void main (String args[])
47 {
48 if (args.length == 0) {
49 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
50 return;
51 }
52
53 try {
54 Searcher searcher = new IndexSearcher(args[0]);
55 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
56
57 // Create one query parser with the standard set of stop words, and one with none
58 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
59 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
60
61 Sort sorter = new Sort();
62 Filter filter = null;
63 String fuzziness = null;
64
65 // Paging
66 int start_results = 1;
67 int end_results = Integer.MAX_VALUE;
68
69 // New code to allow the default conjunction operator to be
70 // definable
71 String default_conjuction_operator = "OR";
72 for (int i = 1; i < args.length; i++) {
73 if (args[i].equals("-sort")) {
74 i++;
75 sorter = new Sort(args[i]);
76 }
77 if (args[i].equals("-filter")) {
78 i++;
79 filter = parseFilterString(args[i]);
80 }
81 if (args[i].equals("-dco")) {
82 i++;
83 default_conjuction_operator = args[i];
84 }
85 if (args[i].equals("-fuzziness")) {
86 i++;
87 fuzziness = args[i];
88 }
89 if (args[i].equals("-startresults")) {
90 i++;
91 if (args[i].matches("\\d+")) {
92 start_results = Integer.parseInt(args[i]);
93 }
94 }
95 if (args[i].equals("-endresults")) {
96 i++;
97 if (args[i].matches("\\d+")) {
98 end_results = Integer.parseInt(args[i]);
99 }
100 }
101 }
102
103 // Lucene does "OR" queries by default; do an "AND" query if specified
104 if (default_conjuction_operator.equals("AND")) {
105 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
106 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
107 }
108
109 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
110 while (true) {
111 // Read the query from STDIN
112 String query_string = in.readLine();
113 if (query_string == null || query_string.length() == -1) {
114 break;
115 }
116 System.out.println("<ResultSet>");
117 System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
118 if (filter != null) {
119 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
120 }
121
122 try {
123 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
124 query_including_stop_words = query_including_stop_words.rewrite(reader);
125
126 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
127 query = query.rewrite(reader);
128
129 // Return the list of expanded query terms and their frequencies
130 HashSet terms = new HashSet();
131 query.extractTerms(terms);
132 Iterator term_iterator = terms.iterator();
133 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
134 while (term_iterator.hasNext()) {
135 Term term = (Term) term_iterator.next();
136
137 // Get the term frequency over all the documents
138 TermDocs term_docs = reader.termDocs(term);
139 int term_freq = term_docs.freq();
140 while (term_docs.next()) {
141 term_freq += term_docs.freq();
142 }
143
144 // If you wanted to limit this to just text terms add
145 // something like this:
146 // if (term.field().equals(TEXTFIELD))
147 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
148 }
149
150 // Return the list of stop words removed from the query
151 HashSet terms_including_stop_words = new HashSet();
152 query_including_stop_words.extractTerms(terms_including_stop_words);
153 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
154 while (terms_including_stop_words_iter.hasNext()) {
155 Term term = (Term) terms_including_stop_words_iter.next();
156 if (!terms.contains(term)) {
157 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
158 }
159 }
160
161 // Simple case for getting all the matching documents
162 if (end_results == Integer.MAX_VALUE) {
163 // Perform the query (filter and sorter may be null)
164 Hits hits = searcher.search(query, filter, sorter);
165 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
166
167 // Output the matching documents
168 System.out.println(" <StartResults num=\"" + start_results + "\" />");
169 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
170 for (int i = start_results; i <= hits.length(); i++) {
171 Document doc = hits.doc(i - 1);
172 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
173 }
174 }
175
176 // Slightly more complicated case for returning a subset of the matching documents
177 else {
178 // Perform the query (filter may be null)
179 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
180 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
181
182 // Output the matching documents
183 System.out.println(" <StartResults num=\"" + start_results + "\" />");
184 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
185 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
186 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
187 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
188 }
189 }
190 }
191 catch (ParseException parse_exception) {
192 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
193 }
194 catch (TooManyClauses too_many_clauses_exception) {
195 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
196 }
197
198 System.out.println("</ResultSet>");
199 }
200
201 searcher.close();
202 }
203 catch (IOException exception) {
204 exception.printStackTrace();
205 }
206 }
207
208 private static String xmlSafe(String text) {
209 return text.replaceAll("\\&", "\\&amp;");
210 }
211
212 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
213 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
214 {
215 // Split query string into the search terms and the filter terms
216 // * The first +(...) term contains the search terms so count
217 // up '(' and stop when we finish matching ')'
218 int offset = 0;
219 int paren_count = 0;
220 boolean seen_paren = false;
221 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
222 if (query_string.charAt(offset) == '(') {
223 paren_count++;
224 seen_paren = true;
225 }
226 if (query_string.charAt(offset) == ')') {
227 paren_count--;
228 }
229 offset++;
230 }
231 String query_prefix = query_string.substring(0, offset);
232 String query_suffix = query_string.substring(offset);
233
234 ///ystem.err.println("Prefix: " + query_prefix);
235 ///ystem.err.println("Suffix: " + query_suffix);
236
237 Query query = query_parser.parse(query_prefix);
238 query = query.rewrite(reader);
239
240 // If this is a fuzzy search, then we need to add the fuzzy
241 // flag to each of the query terms
242 if (fuzziness != null && query.toString().length() > 0) {
243 // Revert the query to a string
244 System.err.println("Rewritten query: " + query.toString());
245 // Search through the string for TX:<term> query terms
246 // and append the ~ operator. Not that this search will
247 // not change phrase searches (TX:"<term> <term>") as
248 // fuzzy searching is not possible for these entries.
249 // Yahoo! Time for a state machine!
250 StringBuffer mutable_query_string = new StringBuffer(query.toString());
251 int o = 0; // Offset
252 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
253 int s = 0; // State
254 while (o < mutable_query_string.length()) {
255 char c = mutable_query_string.charAt(o);
256 if (s == 0 && c == TEXTFIELD.charAt(0)) {
257 ///ystem.err.println("Found T!");
258 s = 1;
259 }
260 else if (s == 1) {
261 if (c == TEXTFIELD.charAt(1)) {
262 ///ystem.err.println("Found X!");
263 s = 2;
264 }
265 else {
266 s = 0; // Reset
267 }
268 }
269 else if (s == 2) {
270 if (c == ':') {
271 ///ystem.err.println("Found TX:!");
272 s = 3;
273 }
274 else {
275 s = 0; // Reset
276 }
277 }
278 else if (s == 3) {
279 // Don't process phrases
280 if (c == '"') {
281 ///ystem.err.println("Stupid phrase...");
282 s = 0; // Reset
283 }
284 // Found the end of the term... add the
285 // fuzzy search indicator
286 // Nor outside the scope of parentheses
287 else if (Character.isWhitespace(c) || c == ')') {
288 ///ystem.err.println("Yahoo! Found fuzzy term.");
289 mutable_query_string.insert(o, '~' + fuzziness);
290 o++;
291 s = 0; // Reset
292 }
293 }
294 o++;
295 }
296 // If we were in the state of looking for the end of a
297 // term - then we just found it!
298 if (s == 3) {
299 mutable_query_string.append('~' + fuzziness);
300 }
301 // Reparse the query
302 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
303 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
304 }
305 else {
306 query = query_parser.parse(query_prefix + query_suffix);
307 }
308
309 return query;
310 }
311
312
313 /**
314 * @todo Michael to comment
315 */
316 private static Filter parseFilterString(String filter_string)
317 {
318 Filter result = null;
319 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
320 Matcher matcher = pattern.matcher(filter_string);
321 if (matcher.matches()) {
322 String field_name = matcher.group(1);
323 boolean include_lower = matcher.group(2).equals("[");
324 String lower_term = matcher.group(3);
325 String upper_term = matcher.group(4);
326 boolean include_upper = matcher.group(5).equals("]");
327 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
328 }
329 else {
330 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
331 }
332 return result;
333 }
334 /** parseFilterString() **/
335}
Note: See TracBrowser for help on using the repository browser.