source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12408

Last change on this file since 12408 was 12408, checked in by mdewsnip, 18 years ago

Added a "-filter" option which can currently be used for specifying range filters (e.g. we're going to use it for dates). Many thanks to Me and DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.9 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @version
6 */
7
8package org.nzdl.gsdl.LuceneWrap;
9
10
11import java.io.BufferedReader;
12import java.io.InputStreamReader;
13import java.util.Collections;
14import java.util.HashMap;
15import java.util.HashSet;
16import java.util.Iterator;
17import java.util.Set;
18
19import org.apache.lucene.analysis.Analyzer;
20import org.apache.lucene.analysis.standard.StandardAnalyzer;
21import org.apache.lucene.document.Document;
22import org.apache.lucene.index.IndexReader;
23import org.apache.lucene.index.Term;
24import org.apache.lucene.index.TermFreqVector;
25import org.apache.lucene.queryParser.QueryParser;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.Sort;
34
35
36public class GS2LuceneQuery
37{
38 public static void main (String args[])
39 {
40 if (args.length == 0) {
41 System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
42 return;
43 }
44
45 try {
46 Searcher searcher = new IndexSearcher(args[0]);
47 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
48
49 Sort sorter = new Sort();
50 Filter filter = null;
51 boolean fuzzy = false;
52
53 // New code to allow the default conjunction operator to be
54 // definable
55 String default_conjuction_operator = "OR";
56 for (int i = 1; i < args.length; i++)
57 {
58 if (args[i].equals("-sort"))
59 {
60 i++;
61 ///ystem.err.println("**** sort by = " + args[i]);
62 sorter = new Sort(args[i]);
63 }
64 if (args[i].equals("-filter"))
65 {
66 i++;
67 filter = parseFilterString(args[i]);
68 }
69 if (args[i].equals("-dco"))
70 {
71 i++;
72 default_conjuction_operator = args[i];
73 }
74 if (args[i].equals("-fuzzy"))
75 {
76 fuzzy = true;
77 }
78 }
79
80 // Create one query parser with the standard set of stop words, and one with none
81 QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
82 QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
83
84 // Lucene does "OR" queries by default; do an "AND" query if specified
85 if (default_conjuction_operator.equals("AND"))
86 {
87 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
88 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
89 }
90
91 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
92 while (true)
93 {
94 // Read the query from STDIN
95 String query_string = in.readLine();
96 if (query_string == null || query_string.length() == -1)
97 {
98 break;
99 }
100 ///ystem.err.println("**** query = " + query_string);
101
102 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
103 query_including_stop_words = query_including_stop_words.rewrite(reader);
104
105 // Split query string into the search terms and the filter terms
106 // * The first +(...) term contains the search terms so count
107 // up '(' and stop when we finish matching ')'
108 int offset = 0;
109 int paren_count = 0;
110 boolean seen_paren = false;
111 while (offset < query_string.length() && (!seen_paren || paren_count > 0))
112 {
113 if (query_string.charAt(offset) == '(')
114 {
115 paren_count++;
116 seen_paren = true;
117 }
118 if (query_string.charAt(offset) == ')')
119 {
120 paren_count--;
121 }
122 offset++;
123 }
124 String query_prefix = query_string.substring(0, offset);
125 String query_suffix = query_string.substring(offset);
126
127 ///ystem.err.println("Prefix: " + query_prefix);
128 ///ystem.err.println("Suffix: " + query_suffix);
129
130 Query query = query_parser.parse(query_prefix);
131 query = query.rewrite(reader);
132
133 // If this is a fuzzy search, then we need to add the fuzzy
134 // flag to each of the query terms
135 if (fuzzy && query.toString().length() > 0)
136 {
137 // Revert the query to a string
138 ///ystem.err.println("Rewritten query: " + query.toString());
139 // Search through the string for TX:<term> query terms
140 // and append the ~ operator. Not that this search will
141 // not change phrase searches (TX:"<term> <term>") as
142 // fuzzy searching is not possible for these entries.
143 // Yahoo! Time for a state machine!
144 StringBuffer mutable_query_string = new StringBuffer(query.toString());
145 int o = 0; // Offset
146 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
147 int s = 0; // State
148 while(o < mutable_query_string.length())
149 {
150 char c = mutable_query_string.charAt(o);
151 if (s == 0 && c == 'T')
152 {
153 ///ystem.err.println("Found T!");
154 s = 1;
155 }
156 else if (s == 1)
157 {
158 if (c == 'X')
159 {
160 ///ystem.err.println("Found X!");
161 s = 2;
162 }
163 else
164 {
165 s = 0; // Reset
166 }
167 }
168 else if (s == 2)
169 {
170 if (c == ':')
171 {
172 ///ystem.err.println("Found TX:!");
173 s = 3;
174 }
175 else
176 {
177 s = 0; // Reset
178 }
179 }
180 else if (s == 3)
181 {
182 // Don't process phrases
183 if (c == '"')
184 {
185 ///ystem.err.println("Stupid phrase...");
186 s = 0; // Reset
187 }
188 // Found the end of the term... add the
189 // fuzzy search indicator
190 // Nor outside the scope of parentheses
191 else if (Character.isWhitespace(c) || c == ')')
192 {
193 ///ystem.err.println("Yahoo! Found fuzzy term.");
194 mutable_query_string.insert(o, '~');
195 o++;
196 s = 0; // Reset
197 }
198 }
199 o++;
200 }
201 // If we were in the state of looking for the end of a
202 // term - then we just found it!
203 if (s == 3)
204 {
205 mutable_query_string.append('~');
206 }
207 // Reparse the query
208 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
209 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
210 // And rewrite again
211 query = query.rewrite(reader);
212 ///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
213 }
214 else
215 {
216 query = query_parser.parse(query_prefix + query_suffix);
217 query = query.rewrite(reader);
218 }
219
220 // Perform the query
221 Hits hits;
222 if (filter != null) {
223 hits = searcher.search(query, filter, sorter);
224 }
225 else {
226 hits = searcher.search(query, sorter);
227 }
228 System.out.println("<ResultSet>");
229 System.out.println(" <QueryString>" + query_string + "</QueryString>");
230 // Return the list of expanded query terms and their frequencies
231 HashMap term_counts = new HashMap();
232 HashMap term_fields = new HashMap();
233 HashSet terms = new HashSet();
234 query.extractTerms(terms);
235 //System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
236 Iterator iter = terms.iterator();
237 while (iter.hasNext())
238 {
239 Term term = (Term) iter.next();
240 // If you wanted to limit this to just TX terms add
241 // something like this:
242 //if (term.field().equals("TX"))
243 term_counts.put(term.text(), new Integer(0));
244 term_fields.put(term.text(), term.field());
245 }
246
247 // Do we need to use a hit iterator to get sorted results?
248 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
249 Iterator hit_iter = hits.iterator();
250 while (hit_iter.hasNext())
251 {
252 Hit hit = (Hit) hit_iter.next();
253 Document doc = hit.getDocument();
254 String node_id = doc.get("nodeID");
255 System.out.println(" <Match id=\"" + node_id + "\" />");
256
257 // From the document, extract the Term Vector for the
258 // TX field
259 TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
260 if (term_freq_vector != null && term_freq_vector.size() > 0)
261 {
262 int[] term_frequencies = term_freq_vector.getTermFrequencies();
263 // Now for each query term, determine the
264 // frequency - which may of course be 0.
265 Set term_counts_set = term_counts.keySet();
266 Iterator terms_iter = term_counts_set.iterator();
267 while (terms_iter.hasNext())
268 {
269 String term = (String) terms_iter.next();
270 Integer count_integer = (Integer) term_counts.get(term);
271 int count = count_integer.intValue();
272 int index = term_freq_vector.indexOf(term);
273 // If the term has a count, then add to
274 // the total count for this term
275 if (index != -1)
276 {
277 count += term_frequencies[index];
278
279 }
280 // Store the result
281 term_counts.put(term, new Integer(count));
282 count_integer = null;
283 term = null;
284 }
285 terms_iter = null;
286 term_counts_set = null;
287 }
288 else
289 {
290 ///ystem.err.println("Error! Missing term vector for document " + hit.getId());
291 }
292 }
293
294 // Retrieve all the useful terms
295 Set term_counts_set = term_counts.keySet();
296 System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
297 // Iterate over them
298 Iterator terms_iter = term_counts_set.iterator();
299 while (terms_iter.hasNext())
300 {
301 String term = (String) terms_iter.next();
302 Integer count = (Integer) term_counts.get(term);
303 String field = (String) term_fields.get(term);
304 System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
305 count = null;
306 term = null;
307 }
308 // Cleanup
309 terms_iter = null;
310 term_counts_set = null;
311
312 // Return the list of stop words removed from the query
313 HashSet terms_including_stop_words = new HashSet();
314 query_including_stop_words.extractTerms(terms_including_stop_words);
315 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
316 while (terms_including_stop_words_iter.hasNext()) {
317 Term term = (Term) terms_including_stop_words_iter.next();
318 if (!terms.contains(term)) {
319 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
320 }
321 }
322
323 System.out.println("</ResultSet>");
324 }
325
326 searcher.close();
327 }
328 catch (Exception exception) {
329 exception.printStackTrace();
330 }
331 }
332
333
334 private static Filter parseFilterString(String filter_string)
335 {
336 // Range filters
337 if (filter_string.matches("(.*):[\\{\\[](.+) TO (.+)[\\}\\]]")) {
338 String field_name = filter_string.substring(0, filter_string.indexOf(":"));
339 boolean include_lower = (filter_string.charAt(filter_string.indexOf(":") + 1) == '[');
340 String lower_term = filter_string.substring(filter_string.indexOf(":") + 2, filter_string.indexOf(" TO "));
341 String upper_term = filter_string.substring(filter_string.indexOf(" TO ") + " TO ".length(), filter_string.length() - 1);
342 boolean include_upper = (filter_string.charAt(filter_string.length() - 1) == ']');
343 return new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
344 }
345
346 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
347 return null;
348 }
349}
Note: See TracBrowser for help on using the repository browser.