Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12776

Last change on this file since 12776 was 12776, checked in by mdewsnip, 18 years ago
Fixed a bug where misspelled words could be marked as stop words with fuzzy searching on.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermFreqVector;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.QueryFilter;
32	import org.apache.lucene.search.RangeFilter;
33	import org.apache.lucene.search.Searcher;
34	import org.apache.lucene.search.Sort;
35
36
37	public class GS2LuceneQuery
38	{
39	// Use the standard set of English stop words by default
40	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
41
42
43	static public void main (String args[])
44	{
45	if (args.length == 0) {
46	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number]");
47	return;
48	}
49
50	try {
51	Searcher searcher = new IndexSearcher(args[0]);
52	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
53
54	// Create one query parser with the standard set of stop words, and one with none
55	QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
56	QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
57
58	Sort sorter = new Sort();
59	Filter filter = null;
60	String fuzziness = null;
61
62	// Paging
63	int start_results = 1;
64	int end_results = -1;
65
66	// New code to allow the default conjunction operator to be
67	// definable
68	String default_conjuction_operator = "OR";
69	for (int i = 1; i < args.length; i++) {
70	if (args[i].equals("-sort")) {
71	i++;
72	sorter = new Sort(args[i]);
73	}
74	if (args[i].equals("-filter")) {
75	i++;
76
77	// Parse up filter
78	filter = parseFilterString(args[i]);
79	}
80	if (args[i].equals("-dco")) {
81	i++;
82	default_conjuction_operator = args[i];
83	}
84	if (args[i].equals("-fuzziness")) {
85	i++;
86	fuzziness = args[i];
87	}
88	if (args[i].equals("-startresults")) {
89	i++;
90	if (args[i].matches("\\d+")) {
91	start_results = Integer.parseInt(args[i]);
92	}
93	}
94	if (args[i].equals("-endresults")) {
95	i++;
96	if (args[i].matches("\\d+")) {
97	end_results = Integer.parseInt(args[i]);
98	}
99	}
100	}
101
102	// Lucene does "OR" queries by default; do an "AND" query if specified
103	if (default_conjuction_operator.equals("AND")) {
104	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
105	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
106	}
107
108	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
109	while (true) {
110	// Read the query from STDIN
111	String query_string = in.readLine();
112	if (query_string == null \|\| query_string.length() == -1) {
113	break;
114	}
115	System.out.println("<ResultSet>");
116	System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
117	if (filter != null) {
118	System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
119	}
120
121	try {
122	Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
123	query_including_stop_words = query_including_stop_words.rewrite(reader);
124
125	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
126	query = query.rewrite(reader);
127
128	// Perform the query
129	Hits hits;
130	if (filter != null) {
131	hits = searcher.search(query, filter, sorter);
132	}
133	else {
134	hits = searcher.search(query, sorter);
135	}
136
137	// Return the list of expanded query terms and their frequencies
138	HashMap term_counts = new HashMap();
139	HashMap term_fields = new HashMap();
140	HashSet terms = new HashSet();
141	query.extractTerms(terms);
142	Iterator iter = terms.iterator();
143	while (iter.hasNext()) {
144	Term term = (Term) iter.next();
145	// If you wanted to limit this to just TX terms add
146	// something like this:
147	//if (term.field().equals("TX"))
148	term_counts.put(term.text(), new Integer(0));
149	term_fields.put(term.text(), term.field());
150	}
151
152	// Do we need to use a hit iterator to get sorted results?
153	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
154	System.out.println(" <StartResults num=\"" + start_results + "\" />");
155	System.out.println(" <EndsResults num=\"" + end_results + "\" />");
156
157	int counter = 1;
158	Iterator hit_iter = hits.iterator();
159	while (hit_iter.hasNext()) {
160	Hit hit = (Hit) hit_iter.next();
161	Document doc = hit.getDocument();
162	String node_id = doc.get("nodeID");
163
164	// May not be paging results
165	if (start_results == 1 && end_results == -1) {
166	System.out.println(" <Match id=\"" + node_id + "\" />");
167	}
168	// Otherwise skip up until page offset
169	else if (start_results <= counter && counter <= end_results) {
170	System.out.println(" <Match id=\"" + node_id + "\" />");
171	}
172	// And skip all the rest
173
174	// From the document, extract the Term Vector for the
175	// TX field
176	TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
177	if (term_freq_vector != null && term_freq_vector.size() > 0) {
178	int[] term_frequencies = term_freq_vector.getTermFrequencies();
179	// Now for each query term, determine the
180	// frequency - which may of course be 0.
181	Set term_counts_set = term_counts.keySet();
182	Iterator terms_iter = term_counts_set.iterator();
183	while (terms_iter.hasNext()) {
184
185	String term = (String) terms_iter.next();
186	Integer count_integer = (Integer) term_counts.get(term);
187	int count = count_integer.intValue();
188	int index = term_freq_vector.indexOf(term);
189	// If the term has a count, then add to
190	// the total count for this term
191	if (index != -1) {
192	count += term_frequencies[index];
193	}
194	// Store the result
195	term_counts.put(term, new Integer(count));
196	count_integer = null;
197	term = null;
198	}
199	terms_iter = null;
200	term_counts_set = null;
201	}
202	else {
203	///ystem.err.println("Error! Missing term vector for document " + hit.getId());
204	}
205	++counter;
206	}
207
208	// Retrieve all the useful terms
209	Set term_counts_set = term_counts.keySet();
210	System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
211	// Iterate over them
212	Iterator terms_iter = term_counts_set.iterator();
213	while (terms_iter.hasNext()) {
214	String term = (String) terms_iter.next();
215	Integer count = (Integer) term_counts.get(term);
216	String field = (String) term_fields.get(term);
217
218	// Ignore any terms with zero frequency, because they don't exist in the matching
219	// documents. It seems that this should never happen, but it's a consequence of
220	// how the terms are identified. The terms are found by rewriting the query (above).
221	// At this point, the query hasn't been run, so each query term is expanded without
222	// knowing whether the expanded term will actually appear in one of the resulting
223	// documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
224	// the search is for "otago AND auckland", no matching documents may include "otaio".
225	// Hopefully that made some sense...
226	if (count.intValue() > 0) {
227	System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
228	}
229	count = null;
230	term = null;
231	}
232
233	// Cleanup
234	terms_iter = null;
235	term_counts_set = null;
236
237	// Return the list of stop words removed from the query
238	HashSet terms_including_stop_words = new HashSet();
239	query_including_stop_words.extractTerms(terms_including_stop_words);
240	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
241	while (terms_including_stop_words_iter.hasNext()) {
242	Term term = (Term) terms_including_stop_words_iter.next();
243	if (!terms.contains(term)) {
244	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
245	}
246	}
247	}
248	catch (ParseException parse_exception) {
249	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
250	}
251	catch (TooManyClauses too_many_clauses_exception) {
252	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
253	}
254
255	System.out.println("</ResultSet>");
256	}
257
258	searcher.close();
259	}
260	catch (IOException exception) {
261	exception.printStackTrace();
262	}
263	}
264
265	private static String xmlSafe(String text) {
266	return text.replaceAll("\\&", "\\&");
267	}
268
269	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
270	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
271	{
272	// Split query string into the search terms and the filter terms
273	// * The first +(...) term contains the search terms so count
274	// up '(' and stop when we finish matching ')'
275	int offset = 0;
276	int paren_count = 0;
277	boolean seen_paren = false;
278	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
279	if (query_string.charAt(offset) == '(') {
280	paren_count++;
281	seen_paren = true;
282	}
283	if (query_string.charAt(offset) == ')') {
284	paren_count--;
285	}
286	offset++;
287	}
288	String query_prefix = query_string.substring(0, offset);
289	String query_suffix = query_string.substring(offset);
290
291	///ystem.err.println("Prefix: " + query_prefix);
292	///ystem.err.println("Suffix: " + query_suffix);
293
294	Query query = query_parser.parse(query_prefix);
295	query = query.rewrite(reader);
296
297	// If this is a fuzzy search, then we need to add the fuzzy
298	// flag to each of the query terms
299	if (fuzziness != null && query.toString().length() > 0) {
300	// Revert the query to a string
301	System.err.println("Rewritten query: " + query.toString());
302	// Search through the string for TX:<term> query terms
303	// and append the ~ operator. Not that this search will
304	// not change phrase searches (TX:"<term> <term>") as
305	// fuzzy searching is not possible for these entries.
306	// Yahoo! Time for a state machine!
307	StringBuffer mutable_query_string = new StringBuffer(query.toString());
308	int o = 0; // Offset
309	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
310	int s = 0; // State
311	while (o < mutable_query_string.length()) {
312	char c = mutable_query_string.charAt(o);
313	if (s == 0 && c == 'T') {
314	///ystem.err.println("Found T!");
315	s = 1;
316	}
317	else if (s == 1) {
318	if (c == 'X') {
319	///ystem.err.println("Found X!");
320	s = 2;
321	}
322	else {
323	s = 0; // Reset
324	}
325	}
326	else if (s == 2) {
327	if (c == ':') {
328	///ystem.err.println("Found TX:!");
329	s = 3;
330	}
331	else {
332	s = 0; // Reset
333	}
334	}
335	else if (s == 3) {
336	// Don't process phrases
337	if (c == '"') {
338	///ystem.err.println("Stupid phrase...");
339	s = 0; // Reset
340	}
341	// Found the end of the term... add the
342	// fuzzy search indicator
343	// Nor outside the scope of parentheses
344	else if (Character.isWhitespace(c) \|\| c == ')') {
345	///ystem.err.println("Yahoo! Found fuzzy term.");
346	mutable_query_string.insert(o, '~' + fuzziness);
347	o++;
348	s = 0; // Reset
349	}
350	}
351	o++;
352	}
353	// If we were in the state of looking for the end of a
354	// term - then we just found it!
355	if (s == 3) {
356	mutable_query_string.append('~' + fuzziness);
357	}
358	// Reparse the query
359	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
360	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
361	}
362	else {
363	query = query_parser.parse(query_prefix + query_suffix);
364	}
365
366	return query;
367	}
368
369
370	/**
371	* @todo Michael to comment
372	*/
373	private static Filter parseFilterString(String filter_string)
374	{
375	Filter result = null;
376	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
377	Matcher matcher = pattern.matcher(filter_string);
378	if (matcher.matches()) {
379	String field_name = matcher.group(1);
380	boolean include_lower = matcher.group(2).equals("[");
381	String lower_term = matcher.group(3);
382	String upper_term = matcher.group(4);
383	boolean include_upper = matcher.group(5).equals("]");
384	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
385	}
386	else {
387	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
388	}
389	return result;
390	}
391	/ parseFilterString() /
392	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: