source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@12775

Last change on this file since 12775 was 12775, checked in by mdewsnip, 18 years ago

Fixed bug where some terms have zero frequency (because they don't actually appear in the matching documents).

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
/**
 *
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @version
 */

package org.nzdl.gsdl.LuceneWrap;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryFilter;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;

public class GS2LuceneQuery
{
    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;


    static public void main(String[] args)
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
            return;
        }

        try {
            Searcher searcher = new IndexSearcher(args[0]);
            IndexReader reader = ((IndexSearcher) searcher).getIndexReader();

            // Create one query parser with the standard set of stop words, and one with none
            QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
            QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
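            // The no-stop-words parser is used after each search to work out which
            // stop words were removed from the query, by comparing its terms with
            // those of the main query (see the <StopWord> output below)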

            Sort sorter = new Sort();
            Filter filter = null;
            String fuzziness = null;

            // Paging
            int start_results = 1;
            int end_results = -1;

            // New code to allow the default conjunction operator to be
            // definable
            String default_conjunction_operator = "OR";
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    sorter = new Sort(args[i]);
                }
                if (args[i].equals("-filter")) {
                    i++;

                    // Parse the filter string
                    filter = parseFilterString(args[i]);
                }
                if (args[i].equals("-dco")) {
                    i++;
                    default_conjunction_operator = args[i];
                }
                if (args[i].equals("-fuzziness")) {
                    i++;
                    fuzziness = args[i];
                }
                if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        start_results = Integer.parseInt(args[i]);
                    }
                }
                if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        end_results = Integer.parseInt(args[i]);
                    }
                }
            }

            // Lucene does "OR" queries by default; do an "AND" query if specified
            if (default_conjunction_operator.equals("AND")) {
                query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
                query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
            }

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
            while (true) {
                // Read the query from STDIN; stop at end of input or on an empty line
                String query_string = in.readLine();
                if (query_string == null || query_string.length() == 0) {
                    break;
                }
                System.out.println("<ResultSet>");
                System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
                if (filter != null) {
                    System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
                }

                try {
                    Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
                    query_including_stop_words = query_including_stop_words.rewrite(reader);

                    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
                    query = query.rewrite(reader);

                    // Perform the query
                    Hits hits;
                    if (filter != null) {
                        hits = searcher.search(query, filter, sorter);
                    }
                    else {
                        hits = searcher.search(query, sorter);
                    }

                    // Return the list of expanded query terms and their frequencies
                    HashMap term_counts = new HashMap();
                    HashMap term_fields = new HashMap();
                    HashSet terms = new HashSet();
                    query.extractTerms(terms);
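                    // Note that extractTerms() is called on the rewritten query, so this
                    // set contains the expanded terms (e.g. from fuzzy or wildcard
                    // expansion) rather than the raw terms the user typed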
                    Iterator iter = terms.iterator();
                    while (iter.hasNext()) {
                        Term term = (Term) iter.next();
                        // If you wanted to limit this to just TX terms, add
                        // something like this:
                        // if (term.field().equals("TX"))
                        term_counts.put(term.text(), new Integer(0));
                        term_fields.put(term.text(), term.field());
                    }

                    // Do we need to use a hit iterator to get sorted results?
                    System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
                    System.out.println(" <StartResults num=\"" + start_results + "\" />");
                    System.out.println(" <EndResults num=\"" + end_results + "\" />");

                    int counter = 1;
                    Iterator hit_iter = hits.iterator();
                    while (hit_iter.hasNext()) {
                        Hit hit = (Hit) hit_iter.next();
                        Document doc = hit.getDocument();
                        String node_id = doc.get("nodeID");

                        // May not be paging results
                        if (start_results == 1 && end_results == -1) {
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // Otherwise skip up until the page offset
                        else if (start_results <= counter && counter <= end_results) {
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // And skip all the rest

                        // From the document, extract the Term Vector for the
                        // TX field
                        TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
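                        // getTermFreqVector() returns null if term vectors were not
                        // stored for this field when the index was built, hence the
                        // null check below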
                        if (term_freq_vector != null && term_freq_vector.size() > 0) {
                            int[] term_frequencies = term_freq_vector.getTermFrequencies();
                            // Now for each query term, determine the
                            // frequency - which may of course be 0.
                            Set term_counts_set = term_counts.keySet();
                            Iterator terms_iter = term_counts_set.iterator();
                            while (terms_iter.hasNext()) {
                                String term = (String) terms_iter.next();
                                Integer count_integer = (Integer) term_counts.get(term);
                                int count = count_integer.intValue();
                                int index = term_freq_vector.indexOf(term);
                                // If the term has a count, then add to
                                // the total count for this term
                                if (index != -1) {
                                    count += term_frequencies[index];
                                }
                                // Store the result
                                term_counts.put(term, new Integer(count));
                                count_integer = null;
                                term = null;
                            }
                            terms_iter = null;
                            term_counts_set = null;
                        }
                        else {
                            // System.err.println("Error! Missing term vector for document " + hit.getId());
                        }
                        ++counter;
                    }

                    // Retrieve all the useful terms
                    Set term_counts_set = term_counts.keySet();
                    // Note: num includes any zero-frequency terms that are skipped below
                    System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
                    // Iterate over them
                    Iterator terms_iter = term_counts_set.iterator();
                    while (terms_iter.hasNext()) {
                        String term = (String) terms_iter.next();
                        Integer count = (Integer) term_counts.get(term);
                        String field = (String) term_fields.get(term);

                        // Ignore any terms with zero frequency, because they don't exist in the matching
                        // documents. It seems that this should never happen, but it's a consequence of
                        // how the terms are identified. The terms are found by rewriting the query (above).
                        // At that point the query hasn't been run, so each query term is expanded without
                        // knowing whether the expanded term will actually appear in any of the resulting
                        // documents. For example, "otago" may be expanded to "otaio" and "otaqo", but if
                        // the search is for "otago AND auckland", no matching documents may include "otaio".
                        if (count.intValue() > 0) {
                            System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
                        }
                        count = null;
                        term = null;
                    }

                    // Cleanup
                    terms_iter = null;
                    term_counts_set = null;

                    // Return the list of stop words removed from the query
                    HashSet terms_including_stop_words = new HashSet();
                    query_including_stop_words.extractTerms(terms_including_stop_words);
                    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
                    while (terms_including_stop_words_iter.hasNext()) {
                        Term term = (Term) terms_including_stop_words_iter.next();
                        if (!terms.contains(term)) {
                            System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
                        }
                    }
                }
                catch (ParseException parse_exception) {
                    System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
                }
                catch (TooManyClauses too_many_clauses_exception) {
                    System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
                }

                System.out.println("</ResultSet>");
            }

            searcher.close();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    private static String xmlSafe(String text) {
        // Escape the characters that are unsafe in XML element content
        return text.replaceAll("&", "&amp;").replaceAll("<", "&lt;").replaceAll(">", "&gt;");
    }

    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split the query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms, so count
        //   up '(' and stop when we finish matching ')'
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);
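        // For example (field names purely illustrative), the query string
        // "+(TX:whale) +CC:a" would split into the prefix "+(TX:whale)"
        // and the suffix " +CC:a"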

        // System.err.println("Prefix: " + query_prefix);
        // System.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {
            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Time for a state machine!
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
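            // For instance, with a fuzziness of "0.5" (the value here is purely
            // illustrative), the term "TX:whale" is rewritten below as "TX:whale~0.5"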
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == 'T') {
                    // System.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == 'X') {
                        // System.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        // System.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        // System.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term - either whitespace or the
                    // closing parenthesis of an enclosing group - so add the
                    // fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        // System.err.println("Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o += fuzziness.length() + 1; // Skip over the inserted text
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were still looking for the end of a term, then the end
            // of the string is it
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            // System.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }


    /**
     * Parses a filter string of the form "+FIELD:[LOWER TO UPPER]" into a
     * RangeFilter. A square bracket makes the corresponding bound inclusive;
     * a curly brace makes it exclusive. Both bounds must be numeric.
     */
    private static Filter parseFilterString(String filter_string)
    {
        Filter result = null;
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
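        // The pattern accepts, for example, "+YR:[1990 TO 2000]" (both bounds
        // inclusive) or "+YR:{1990 TO 2000}" (both bounds exclusive); the field
        // name "YR" here is purely illustrative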
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }
    /** parseFilterString() **/
}