source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12390

Last change on this file since 12390 was 12390, checked in by mdewsnip, 18 years ago

More fixes, many thanks to John Thompson and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @version
6 */
7
8package org.nzdl.gsdl.LuceneWrap;
9
10
11import java.io.BufferedReader;
12import java.io.InputStreamReader;
13import java.util.Collections;
14import java.util.HashMap;
15import java.util.HashSet;
16import java.util.Iterator;
17import java.util.Set;
18
19import org.apache.lucene.analysis.Analyzer;
20import org.apache.lucene.analysis.standard.StandardAnalyzer;
21import org.apache.lucene.document.Document;
22import org.apache.lucene.index.IndexReader;
23import org.apache.lucene.index.Term;
24import org.apache.lucene.index.TermFreqVector;
25import org.apache.lucene.queryParser.QueryParser;
26import org.apache.lucene.search.Hit;
27import org.apache.lucene.search.Hits;
28import org.apache.lucene.search.IndexSearcher;
29import org.apache.lucene.search.Query;
30import org.apache.lucene.search.Searcher;
31import org.apache.lucene.search.Sort;
32
33
34public class GS2LuceneQuery
35{
36 public static void main (String args[])
37 {
38 if (args.length == 0) {
39 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
40 return;
41 }
42
43 try {
44 Searcher searcher = new IndexSearcher(args[0]);
45 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
46
47 Sort sorter = new Sort();
48 boolean fuzzy = false;
49
50 // New code to allow the default conjunction operator to be
51 // definable
52 String default_conjuction_operator = "OR";
53 for (int i = 1; i < args.length; i++)
54 {
55 if (args[i].equals("-sort"))
56 {
57 i++;
58 ///ystem.err.println("**** sort by = " + args[i]);
59 sorter = new Sort(args[i]);
60 }
61 if (args[i].equals("-dco"))
62 {
63 i++;
64 default_conjuction_operator = args[i];
65 }
66 if (args[i].equals("-fuzzy"))
67 {
68 fuzzy = true;
69 }
70 }
71
72 // Create one query parser with the standard set of stop words, and one with none
73 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
74 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
75
76 // Lucene does "OR" queries by default; do an "AND" query if specified
77 if (default_conjuction_operator.equals("AND"))
78 {
79 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
80 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
81 }
82
83 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
84 while (true)
85 {
86 // Read the query from STDIN
87 String query_string = in.readLine();
88 if (query_string == null || query_string.length() == -1)
89 {
90 break;
91 }
92 ///ystem.err.println("**** query = " + query_string);
93
94 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
95 query_including_stop_words = query_including_stop_words.rewrite(reader);
96
97 // Split query string into the search terms and the filter terms
98 // * The first +(...) term contains the search terms so count
99 // up '(' and stop when we finish matching ')'
100 int offset = 0;
101 int paren_count = 0;
102 boolean seen_paren = false;
103 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
104 {
105 if (query_string.charAt(offset) == '(')
106 {
107 paren_count++;
108 seen_paren = true;
109 }
110 if (query_string.charAt(offset) == ')')
111 {
112 paren_count--;
113 }
114 offset++;
115 }
116 String query_prefix = query_string.substring(0, offset);
117 String query_suffix = query_string.substring(offset);
118
119 ///ystem.err.println("Prefix: " + query_prefix);
120 ///ystem.err.println("Suffix: " + query_suffix);
121
122 Query query = query_parser.parse(query_prefix);
123 query = query.rewrite(reader);
124
125 // If this is a fuzzy search, then we need to add the fuzzy
126 // flag to each of the query terms
127 if (fuzzy && query.toString().length() > 0)
128 {
129 // Revert the query to a string
130 ///ystem.err.println("Rewritten query: " + query.toString());
131 // Search through the string for TX:<term> query terms
132 // and append the ~ operator. Not that this search will
133 // not change phrase searches (TX:"<term> <term>") as
134 // fuzzy searching is not possible for these entries.
135 // Yahoo! Time for a state machine!
136 StringBuffer mutable_query_string = new StringBuffer(query.toString());
137 int o = 0; // Offset
138 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
139 int s = 0; // State
140 while(o < mutable_query_string.length())
141 {
142 char c = mutable_query_string.charAt(o);
143 if (s == 0 && c == 'T')
144 {
145 ///ystem.err.println("Found T!");
146 s = 1;
147 }
148 else if (s == 1)
149 {
150 if (c == 'X')
151 {
152 ///ystem.err.println("Found X!");
153 s = 2;
154 }
155 else
156 {
157 s = 0; // Reset
158 }
159 }
160 else if (s == 2)
161 {
162 if (c == ':')
163 {
164 ///ystem.err.println("Found TX:!");
165 s = 3;
166 }
167 else
168 {
169 s = 0; // Reset
170 }
171 }
172 else if (s == 3)
173 {
174 // Don't process phrases
175 if (c == '"')
176 {
177 ///ystem.err.println("Stupid phrase...");
178 s = 0; // Reset
179 }
180 // Found the end of the term... add the
181 // fuzzy search indicator
182 // Nor outside the scope of parentheses
183 else if (Character.isWhitespace(c) || c == ')')
184 {
185 ///ystem.err.println("Yahoo! Found fuzzy term.");
186 mutable_query_string.insert(o, '~');
187 o++;
188 s = 0; // Reset
189 }
190 }
191 o++;
192 }
193 // If we were in the state of looking for the end of a
194 // term - then we just found it!
195 if (s == 3)
196 {
197 mutable_query_string.append('~');
198 }
199 // Reparse the query
200 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
201 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
202 // And rewrite again
203 query = query.rewrite(reader);
204 ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
205 }
206 else
207 {
208 query = query_parser.parse(query_prefix + query_suffix);
209 query = query.rewrite(reader);
210 }
211
212 // Perform the query
213 Hits hits = searcher.search(query, sorter);
214 System.out.println("<ResultSet>");
215 System.out.println(" <QueryString>" + query_string + "</QueryString>");
216 // Return the list of expanded query terms and their frequencies
217 HashMap term_counts = new HashMap();
218 HashMap term_fields = new HashMap();
219 HashSet terms = new HashSet();
220 query.extractTerms(terms);
221 //System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
222 Iterator iter = terms.iterator();
223 while (iter.hasNext())
224 {
225 Term term = (Term) iter.next();
226 // If you wanted to limit this to just TX terms add
227 // something like this:
228 //if (term.field().equals("TX"))
229 term_counts.put(term.text(), new Integer(0));
230 term_fields.put(term.text(), term.field());
231 }
232
233 // Do we need to use a hit iterator to get sorted results?
234 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
235 Iterator hit_iter = hits.iterator();
236 while (hit_iter.hasNext())
237 {
238 Hit hit = (Hit) hit_iter.next();
239 Document doc = hit.getDocument();
240 String node_id = doc.get("nodeID");
241 System.out.println(" <Match id=\"" + node_id + "\" />");
242
243 // From the document, extract the Term Vector for the
244 // TX field
245 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
246 if (term_freq_vector.size() > 0)
247 {
248 int[] term_frequencies = term_freq_vector.getTermFrequencies();
249 // Now for each query term, determine the
250 // frequency - which may of course be 0.
251 Set term_counts_set = term_counts.keySet();
252 Iterator terms_iter = term_counts_set.iterator();
253 while (terms_iter.hasNext())
254 {
255 String term = (String) terms_iter.next();
256 Integer count_integer = (Integer) term_counts.get(term);
257 int count = count_integer.intValue();
258 int index = term_freq_vector.indexOf(term);
259 // If the term has a count, then add to
260 // the total count for this term
261 if (index != -1)
262 {
263 count += term_frequencies[index];
264
265 }
266 // Store the result
267 term_counts.put(term, new Integer(count));
268 count_integer = null;
269 term = null;
270 }
271 terms_iter = null;
272 term_counts_set = null;
273 }
274 else
275 {
276 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
277 }
278 }
279
280 // Retrieve all the useful terms
281 Set term_counts_set = term_counts.keySet();
282 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
283 // Iterate over them
284 Iterator terms_iter = term_counts_set.iterator();
285 while (terms_iter.hasNext())
286 {
287 String term = (String) terms_iter.next();
288 Integer count = (Integer) term_counts.get(term);
289 String field = (String) term_fields.get(term);
290 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
291 count = null;
292 term = null;
293 }
294 // Cleanup
295 terms_iter = null;
296 term_counts_set = null;
297
298 // Return the list of stop words removed from the query
299 HashSet terms_including_stop_words = new HashSet();
300 query_including_stop_words.extractTerms(terms_including_stop_words);
301 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
302 while (terms_including_stop_words_iter.hasNext()) {
303 Term term = (Term) terms_including_stop_words_iter.next();
304 if (!terms.contains(term)) {
305 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
306 }
307 }
308
309 System.out.println("</ResultSet>");
310 }
311
312 searcher.close();
313 }
314 catch (Exception exception) {
315 exception.printStackTrace();
316 }
317 }
318}
Note: See TracBrowser for help on using the repository browser.