source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 12991

Last change on this file since 12991 was 12991, checked in by mdewsnip, 18 years ago

Ooops... managed to lose the header of the XML output in my recent changes.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.8 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Use the standard set of English stop words by default
43 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45 // Command-line options
46 static private String fuzziness = null;
47 static private Filter filter = null;
48 static private Sort sorter = new Sort();
49 static private String default_conjuction_operator = "OR";
50 static private int start_results = 1;
51 static private int end_results = Integer.MAX_VALUE;
52
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 String query_string = null;
70
71 // Parse the command-line arguments
72 for (int i = 1; i < args.length; i++) {
73 if (args[i].equals("-sort")) {
74 i++;
75 sorter = new Sort(args[i]);
76 }
77 else if (args[i].equals("-filter")) {
78 i++;
79 filter = parseFilterString(args[i]);
80 }
81 else if (args[i].equals("-dco")) {
82 i++;
83 default_conjuction_operator = args[i];
84 }
85 else if (args[i].equals("-fuzziness")) {
86 i++;
87 fuzziness = args[i];
88 }
89 else if (args[i].equals("-startresults")) {
90 i++;
91 if (args[i].matches("\\d+")) {
92 start_results = Integer.parseInt(args[i]);
93 }
94 }
95 else if (args[i].equals("-endresults")) {
96 i++;
97 if (args[i].matches("\\d+")) {
98 end_results = Integer.parseInt(args[i]);
99 }
100 }
101 else {
102 query_string = args[i];
103 }
104 }
105
106 // Lucene does "OR" queries by default; do an "AND" query if specified
107 if (default_conjuction_operator.equals("AND")) {
108 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110 }
111
112 // The query string has been specified as a command-line argument
113 if (query_string != null) {
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116
117 // Read queries from STDIN
118 else {
119 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120 while (true) {
121 // Read the query from STDIN
122 query_string = in.readLine();
123 if (query_string == null || query_string.length() == -1) {
124 break;
125 }
126
127 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128 }
129 }
130
131 searcher.close();
132 }
133 catch (IOException exception) {
134 exception.printStackTrace();
135 }
136 }
137
138
139 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
140 throws IOException
141 {
142 System.out.println("<ResultSet>");
143 System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
144 if (filter != null) {
145 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
146 }
147
148 try {
149 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
150 query_including_stop_words = query_including_stop_words.rewrite(reader);
151
152 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
153 query = query.rewrite(reader);
154
155 // Return the list of expanded query terms and their frequencies
156 HashSet terms = new HashSet();
157 query.extractTerms(terms);
158 Iterator term_iterator = terms.iterator();
159 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
160 while (term_iterator.hasNext()) {
161 Term term = (Term) term_iterator.next();
162
163 // Get the term frequency over all the documents
164 TermDocs term_docs = reader.termDocs(term);
165 int term_freq = term_docs.freq();
166 while (term_docs.next()) {
167 term_freq += term_docs.freq();
168 }
169
170 // If you wanted to limit this to just text terms add
171 // something like this:
172 // if (term.field().equals(TEXTFIELD))
173 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
174 }
175
176 // Return the list of stop words removed from the query
177 HashSet terms_including_stop_words = new HashSet();
178 query_including_stop_words.extractTerms(terms_including_stop_words);
179 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
180 while (terms_including_stop_words_iter.hasNext()) {
181 Term term = (Term) terms_including_stop_words_iter.next();
182 if (!terms.contains(term)) {
183 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
184 }
185 }
186
187 // Simple case for getting all the matching documents
188 if (end_results == Integer.MAX_VALUE) {
189 // Perform the query (filter and sorter may be null)
190 Hits hits = searcher.search(query, filter, sorter);
191 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
192
193 // Output the matching documents
194 System.out.println(" <StartResults num=\"" + start_results + "\" />");
195 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
196 for (int i = start_results; i <= hits.length(); i++) {
197 Document doc = hits.doc(i - 1);
198 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
199 }
200 }
201
202 // Slightly more complicated case for returning a subset of the matching documents
203 else {
204 // Perform the query (filter may be null)
205 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
206 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
207
208 // Output the matching documents
209 System.out.println(" <StartResults num=\"" + start_results + "\" />");
210 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
211 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
212 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
213 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
214 }
215 }
216 }
217 catch (ParseException parse_exception) {
218 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
219 }
220 catch (TooManyClauses too_many_clauses_exception) {
221 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
222 }
223
224 System.out.println("</ResultSet>");
225 }
226
227
/**
 * Escapes the five characters that have special meaning in XML so the
 * text can be placed in element content or in a quoted attribute value.
 *
 * @param text raw text; must not be null
 * @return text with &amp;, &lt;, &gt;, double and single quotes replaced
 *         by their predefined XML entities
 */
private static String xmlSafe(String text) {
    StringBuffer escaped = new StringBuffer(text.length() + 16);
    for (int i = 0; i < text.length(); i++) {
        char c = text.charAt(i);
        switch (c) {
            case '&':  escaped.append("&amp;");  break;
            case '<':  escaped.append("&lt;");   break;
            case '>':  escaped.append("&gt;");   break;
            case '"':  escaped.append("&quot;"); break;
            case '\'': escaped.append("&apos;"); break;
            default:   escaped.append(c);
        }
    }
    return escaped.toString();
}
231
232 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
233 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
234 {
235 // Split query string into the search terms and the filter terms
236 // * The first +(...) term contains the search terms so count
237 // up '(' and stop when we finish matching ')'
238 int offset = 0;
239 int paren_count = 0;
240 boolean seen_paren = false;
241 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
242 if (query_string.charAt(offset) == '(') {
243 paren_count++;
244 seen_paren = true;
245 }
246 if (query_string.charAt(offset) == ')') {
247 paren_count--;
248 }
249 offset++;
250 }
251 String query_prefix = query_string.substring(0, offset);
252 String query_suffix = query_string.substring(offset);
253
254 ///ystem.err.println("Prefix: " + query_prefix);
255 ///ystem.err.println("Suffix: " + query_suffix);
256
257 Query query = query_parser.parse(query_prefix);
258 query = query.rewrite(reader);
259
260 // If this is a fuzzy search, then we need to add the fuzzy
261 // flag to each of the query terms
262 if (fuzziness != null && query.toString().length() > 0) {
263 // Revert the query to a string
264 System.err.println("Rewritten query: " + query.toString());
265 // Search through the string for TX:<term> query terms
266 // and append the ~ operator. Not that this search will
267 // not change phrase searches (TX:"<term> <term>") as
268 // fuzzy searching is not possible for these entries.
269 // Yahoo! Time for a state machine!
270 StringBuffer mutable_query_string = new StringBuffer(query.toString());
271 int o = 0; // Offset
272 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
273 int s = 0; // State
274 while (o < mutable_query_string.length()) {
275 char c = mutable_query_string.charAt(o);
276 if (s == 0 && c == TEXTFIELD.charAt(0)) {
277 ///ystem.err.println("Found T!");
278 s = 1;
279 }
280 else if (s == 1) {
281 if (c == TEXTFIELD.charAt(1)) {
282 ///ystem.err.println("Found X!");
283 s = 2;
284 }
285 else {
286 s = 0; // Reset
287 }
288 }
289 else if (s == 2) {
290 if (c == ':') {
291 ///ystem.err.println("Found TX:!");
292 s = 3;
293 }
294 else {
295 s = 0; // Reset
296 }
297 }
298 else if (s == 3) {
299 // Don't process phrases
300 if (c == '"') {
301 ///ystem.err.println("Stupid phrase...");
302 s = 0; // Reset
303 }
304 // Found the end of the term... add the
305 // fuzzy search indicator
306 // Nor outside the scope of parentheses
307 else if (Character.isWhitespace(c) || c == ')') {
308 ///ystem.err.println("Yahoo! Found fuzzy term.");
309 mutable_query_string.insert(o, '~' + fuzziness);
310 o++;
311 s = 0; // Reset
312 }
313 }
314 o++;
315 }
316 // If we were in the state of looking for the end of a
317 // term - then we just found it!
318 if (s == 3) {
319 mutable_query_string.append('~' + fuzziness);
320 }
321 // Reparse the query
322 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
323 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
324 }
325 else {
326 query = query_parser.parse(query_prefix + query_suffix);
327 }
328
329 return query;
330 }
331
332
/**
 * Builds a range filter from a string of the form
 * "+field:[lower TO upper]", where the bounds are runs of digits and each
 * bracket may be square (bound included) or curly (bound excluded).
 *
 * @param filter_string the filter specification
 * @return the corresponding RangeFilter, or null (after reporting the
 *         problem on STDERR) when the string does not have that form
 */
private static Filter parseFilterString(String filter_string)
{
    Matcher range_match = Pattern
        .compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*")
        .matcher(filter_string);

    // Reject anything that is not exactly one range expression
    if (!range_match.matches()) {
        System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        return null;
    }

    String field = range_match.group(1);
    String lower_bound = range_match.group(3);
    String upper_bound = range_match.group(4);

    // Square brackets make the corresponding bound inclusive
    boolean lower_inclusive = "[".equals(range_match.group(2));
    boolean upper_inclusive = "]".equals(range_match.group(5));

    return new RangeFilter(field, lower_bound, upper_bound, lower_inclusive, upper_inclusive);
}
355}
Note: See TracBrowser for help on using the repository browser.