source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12387

Last change on this file since 12387 was 12387, checked in by mdewsnip, 18 years ago

Fixes for fuzzy searching, many thanks to John Thompson and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.1 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @version
6 */
7
8package org.nzdl.gsdl.LuceneWrap;
9
10
11import java.io.BufferedReader;
12import java.io.InputStreamReader;
13import java.util.Collections;
14import java.util.HashMap;
15import java.util.HashSet;
16import java.util.Iterator;
17import java.util.Set;
18
19import org.apache.lucene.analysis.Analyzer;
20import org.apache.lucene.analysis.standard.StandardAnalyzer;
21import org.apache.lucene.document.Document;
22import org.apache.lucene.index.IndexReader;
23import org.apache.lucene.index.Term;
24import org.apache.lucene.index.TermFreqVector;
25import org.apache.lucene.queryParser.QueryParser;
26import org.apache.lucene.search.Hit;
27import org.apache.lucene.search.Hits;
28import org.apache.lucene.search.IndexSearcher;
29import org.apache.lucene.search.Query;
30import org.apache.lucene.search.Searcher;
31import org.apache.lucene.search.Sort;
32
33
34public class GS2LuceneQuery
35{
36 public static void main (String args[])
37 {
38 if (args.length == 0) {
39 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
40 return;
41 }
42
43 try {
44 Searcher searcher = new IndexSearcher(args[0]);
45 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
46
47 Sort sorter = new Sort();
48 boolean fuzzy = false;
49
50 // New code to allow the default conjunction operator to be
51 // definable
52 String default_conjuction_operator = "OR";
53 for (int i = 1; i < args.length; i++)
54 {
55 if (args[i].equals("-sort"))
56 {
57 i++;
58 ///ystem.err.println("**** sort by = " + args[i]);
59 sorter = new Sort(args[i]);
60 }
61 if (args[i].equals("-dco"))
62 {
63 i++;
64 default_conjuction_operator = args[i];
65 }
66 if (args[i].equals("-fuzzy"))
67 {
68 fuzzy = true;
69 }
70 }
71
72 // Create one query parser with the standard set of stop words, and one with none
73 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
74 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
75
76 // Lucene does "OR" queries by default; do an "AND" query if specified
77 if (default_conjuction_operator.equals("AND")) {
78 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
79 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
80 }
81
82 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
83 while (true) {
84 // Read the query from STDIN
85 String query_string = in.readLine();
86 if (query_string == null || query_string.length() == -1) {
87 break;
88 }
89 ///ystem.err.println("**** query = " + query_string);
90
91 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
92 query_including_stop_words = query_including_stop_words.rewrite(reader);
93
94 // Split query string into the search terms and the filter terms
95 // * The first +(...) term contains the search terms so count
96 // up '(' and stop when we finish matching ')'
97 int offset = 0;
98 int paren_count = 0;
99 boolean seen_paren = false;
100 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
101 {
102 if (query_string.charAt(offset) == '(')
103 {
104 paren_count++;
105 seen_paren = true;
106 }
107 if (query_string.charAt(offset) == ')')
108 {
109 paren_count--;
110 }
111 offset++;
112 }
113 String query_prefix = query_string.substring(0, offset);
114 String query_suffix = query_string.substring(offset);
115
116 ///ystem.err.println("Prefix: " + query_prefix);
117 ///ystem.err.println("Suffix: " + query_suffix);
118
119 Query query = query_parser.parse(query_prefix);
120 query = query.rewrite(reader);
121
122 // If this is a fuzzy search, then we need to add the fuzzy
123 // flag to each of the query terms
124 if (fuzzy && query.toString().length() > 0)
125 {
126 // Revert the query to a string
127 ///ystem.err.println("Rewritten query: " + query.toString());
128 // Search through the string for TX:<term> query terms
129 // and append the ~ operator. Not that this search will
130 // not change phrase searches (TX:"<term> <term>") as
131 // fuzzy searching is not possible for these entries.
132 // Yahoo! Time for a state machine!
133 StringBuffer mutable_query_string = new StringBuffer(query.toString());
134 int o = 0; // Offset
135 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
136 int s = 0; // State
137 while(o < mutable_query_string.length())
138 {
139 char c = mutable_query_string.charAt(o);
140 if (s == 0 && c == 'T')
141 {
142 ///ystem.err.println("Found T!");
143 s = 1;
144 }
145 else if (s == 1)
146 {
147 if (c == 'X')
148 {
149 ///ystem.err.println("Found X!");
150 s = 2;
151 }
152 else
153 {
154 s = 0; // Reset
155 }
156 }
157 else if (s == 2)
158 {
159 if (c == ':')
160 {
161 ///ystem.err.println("Found TX:!");
162 s = 3;
163 }
164 else
165 {
166 s = 0; // Reset
167 }
168 }
169 else if (s == 3)
170 {
171 // Don't process phrases
172 if (c == '"')
173 {
174 ///ystem.err.println("Stupid phrase...");
175 s = 0; // Reset
176 }
177 // Found the end of the term... add the
178 // fuzzy search indicator
179 // Nor outside the scope of parentheses
180 else if (Character.isWhitespace(c) || c == ')')
181 {
182 ///ystem.err.println("Yahoo! Found fuzzy term.");
183 mutable_query_string.insert(o, '~');
184 o++;
185 s = 0; // Reset
186 }
187 }
188 o++;
189 }
190 // If we were in the state of looking for the end of a
191 // term - then we just found it!
192 if (s == 3)
193 {
194 mutable_query_string.append('~');
195 }
196 // Reparse the query
197 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
198 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
199 // And rewrite again
200 query = query.rewrite(reader);
201 ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
202 }
203
204
205 // Perform the query
206 Hits hits = searcher.search(query, sorter);
207 System.out.println("<ResultSet>");
208 System.out.println(" <QueryString>" + query_string + "</QueryString>");
209 // Return the list of expanded query terms and their frequencies
210 HashMap term_counts = new HashMap();
211 HashMap term_fields = new HashMap();
212 HashSet terms = new HashSet();
213 query.extractTerms(terms);
214 //System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
215 Iterator iter = terms.iterator();
216 while (iter.hasNext())
217 {
218 Term term = (Term) iter.next();
219 // If you wanted to limit this to just TX terms add
220 // something like this:
221 //if (term.field().equals("TX"))
222 term_counts.put(term.text(), new Integer(0));
223 term_fields.put(term.text(), term.field());
224 }
225
226 // Do we need to use a hit iterator to get sorted results?
227 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
228 Iterator hit_iter = hits.iterator();
229 while (hit_iter.hasNext())
230 {
231 Hit hit = (Hit) hit_iter.next();
232 Document doc = hit.getDocument();
233 String node_id = doc.get("nodeID");
234 System.out.println(" <Match id=\"" + node_id + "\" />");
235
236 // From the document, extract the Term Vector for the
237 // TX field
238 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
239 if (term_freq_vector.size() > 0)
240 {
241 int[] term_frequencies = term_freq_vector.getTermFrequencies();
242 // Now for each query term, determine the
243 // frequency - which may of course be 0.
244 Set term_counts_set = term_counts.keySet();
245 Iterator terms_iter = term_counts_set.iterator();
246 while (terms_iter.hasNext())
247 {
248 String term = (String) terms_iter.next();
249 Integer count_integer = (Integer) term_counts.get(term);
250 int count = count_integer.intValue();
251 int index = term_freq_vector.indexOf(term);
252 // If the term has a count, then add to
253 // the total count for this term
254 if (index != -1)
255 {
256 count += term_frequencies[index];
257
258 }
259 // Store the result
260 term_counts.put(term, new Integer(count));
261 count_integer = null;
262 term = null;
263 }
264 terms_iter = null;
265 term_counts_set = null;
266 }
267 else
268 {
269 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
270 }
271 }
272
273 // Retrieve all the useful terms
274 Set term_counts_set = term_counts.keySet();
275 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
276 // Iterate over them
277 Iterator terms_iter = term_counts_set.iterator();
278 while (terms_iter.hasNext())
279 {
280 String term = (String) terms_iter.next();
281 Integer count = (Integer) term_counts.get(term);
282 String field = (String) term_fields.get(term);
283 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
284 count = null;
285 term = null;
286 }
287 // Cleanup
288 terms_iter = null;
289 term_counts_set = null;
290
291 // Return the list of stop words removed from the query
292 HashSet terms_including_stop_words = new HashSet();
293 query_including_stop_words.extractTerms(terms_including_stop_words);
294 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
295 while (terms_including_stop_words_iter.hasNext()) {
296 Term term = (Term) terms_including_stop_words_iter.next();
297 if (!terms.contains(term)) {
298 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
299 }
300 }
301
302 System.out.println("</ResultSet>");
303 }
304
305 searcher.close();
306 }
307 catch (Exception exception) {
308 exception.printStackTrace();
309 }
310 }
311}
Note: See TracBrowser for help on using the repository browser.