source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@12976

Last change on this file since 12976 was 12976, checked in by mdewsnip, 18 years ago

Rearranged some code to make the fact that the term information is now independent of the search results clearer.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
/**
 *
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @version
 */

package org.nzdl.gsdl.LuceneWrap;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;


public class GS2LuceneQuery
{
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;


    static public void main (String args[])
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
            return;
        }
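
        // Illustrative example of how this class might be invoked (the index path
        // and option values below are hypothetical, and the classpath is omitted):
        //   java org.nzdl.gsdl.LuceneWrap.GS2LuceneQuery /path/to/index -dco AND -startresults 1 -endresults 20
        // Queries are then read from standard input, one per line, and each one
        // produces a <ResultSet> block on standard output.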

        try {
            Searcher searcher = new IndexSearcher(args[0]);
            IndexReader reader = ((IndexSearcher) searcher).getIndexReader();

            // Create one query parser with the standard set of stop words, and one with none
            QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
            QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
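            // (The stop-word-free parser is only used further down to work out
            // which stop words were removed from the user's query.)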

            Sort sorter = null;
            Filter filter = null;
            String fuzziness = null;

            // Paging
            int start_results = 1;
            int end_results = -1;

            // New code to allow the default conjunction operator to be
            // definable
            String default_conjunction_operator = "OR";
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    sorter = new Sort(args[i]);
                }
                if (args[i].equals("-filter")) {
                    i++;
                    filter = parseFilterString(args[i]);
                }
                if (args[i].equals("-dco")) {
                    i++;
                    default_conjunction_operator = args[i];
                }
                if (args[i].equals("-fuzziness")) {
                    i++;
                    fuzziness = args[i];
                }
                if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        start_results = Integer.parseInt(args[i]);
                    }
                }
                if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        end_results = Integer.parseInt(args[i]);
                    }
                }
            }

            // Lucene does "OR" queries by default; do an "AND" query if specified
            if (default_conjunction_operator.equals("AND")) {
                query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
                query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
            }

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
            while (true) {
                // Read the query from STDIN
                String query_string = in.readLine();
                // Stop on end of input or an empty line
                if (query_string == null || query_string.length() == 0) {
                    break;
                }
114 System.out.println("<ResultSet>");
115 System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
116 if (filter != null) {
117 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
118 }

                try {
                    Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
                    query_including_stop_words = query_including_stop_words.rewrite(reader);

                    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
                    query = query.rewrite(reader);

                    // Return the list of expanded query terms and their frequencies
                    HashSet terms = new HashSet();
                    query.extractTerms(terms);
                    Iterator term_iterator = terms.iterator();
                    System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
                    while (term_iterator.hasNext()) {
                        Term term = (Term) term_iterator.next();

                        // Get the term frequency over all the documents
                        // (sum the within-document frequency of every document that
                        // contains the term; freq() is only valid after next())
                        TermDocs term_docs = reader.termDocs(term);
                        int term_freq = 0;
                        while (term_docs.next()) {
                            term_freq += term_docs.freq();
                        }

                        // If you wanted to limit this to just text terms add
                        // something like this:
                        // if (term.field().equals(TEXTFIELD))
                        System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
                    }

                    // Return the list of stop words removed from the query
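                    // (A term is reported as a stop word if it appears in the
                    // stop-word-free parse but not in the filtered query above.)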
                    HashSet terms_including_stop_words = new HashSet();
                    query_including_stop_words.extractTerms(terms_including_stop_words);
                    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
                    while (terms_including_stop_words_iter.hasNext()) {
                        Term term = (Term) terms_including_stop_words_iter.next();
                        if (!terms.contains(term)) {
                            System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
                        }
                    }

                    // Perform the query (filter and sorter may be null)
                    Hits hits = searcher.search(query, filter, sorter);

                    // Do we need to use a hit iterator to get sorted results?
                    System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
                    System.out.println(" <StartResults num=\"" + start_results + "\" />");
                    System.out.println(" <EndsResults num=\"" + end_results + "\" />");

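                    // Results are numbered from 1; a hit is only printed when no
                    // paging was requested, or when it falls inside the inclusive
                    // [start_results, end_results] window.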
                    int counter = 1;
                    Iterator hit_iter = hits.iterator();
                    while (hit_iter.hasNext()) {
                        Hit hit = (Hit) hit_iter.next();
                        Document doc = hit.getDocument();

                        // May not be paging results
                        if (start_results == 1 && end_results == -1) {
                            String node_id = doc.get("nodeID");
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // Otherwise skip up until page offset
                        else if (start_results <= counter && counter <= end_results) {
                            String node_id = doc.get("nodeID");
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // And skip all the rest

                        ++counter;
                    }
                }
                catch (ParseException parse_exception) {
                    System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
                }
                catch (TooManyClauses too_many_clauses_exception) {
                    System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
                }

                System.out.println("</ResultSet>");
            }

            searcher.close();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    // Minimal XML escaping: only ampersands are escaped here
    private static String xmlSafe(String text) {
        return text.replaceAll("\\&", "\\&amp;");
    }

    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
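        // For example, an incoming line such as "+(TX:(snail farming)) +CC:(CL1)"
        // (field names here are illustrative only) would be split into the prefix
        // "+(TX:(snail farming))" and the suffix " +CC:(CL1)".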
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {
            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
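            // For example (terms and fuzziness value of 0.7 illustrative only),
            // a rewritten query "TX:dog TX:cat" would become
            // "TX:dog~0.7 TX:cat~0.7" before being reparsed below.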
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term (either whitespace or a
                    // closing parenthesis)... add the fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }


    /**
     * @todo Michael to comment
     */
    private static Filter parseFilterString(String filter_string)
    {
        Filter result = null;
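        // The filter string is expected to be a single-field numeric range,
        // e.g. "+CD:[20000101 TO 20061231]" (the field name and values here are
        // illustrative only); square brackets include the end points, curly
        // braces exclude them.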
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }
    /** parseFilterString() **/
}