Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12653

Last change on this file since 12653 was 12653, checked in by mdewsnip, 18 years ago
Made it a little bit easier to use a custom set of stop words with Lucene.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.1 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @version
6	*/
7
8	package org.nzdl.gsdl.LuceneWrap;
9
10
11	import java.io.*;
12	import java.util.*;
13
14	import org.apache.lucene.analysis.Analyzer;
15	import org.apache.lucene.analysis.standard.StandardAnalyzer;
16	import org.apache.lucene.document.Document;
17	import org.apache.lucene.index.IndexReader;
18	import org.apache.lucene.index.Term;
19	import org.apache.lucene.index.TermFreqVector;
20	import org.apache.lucene.queryParser.ParseException;
21	import org.apache.lucene.queryParser.QueryParser;
22	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
23	import org.apache.lucene.search.Filter;
24	import org.apache.lucene.search.Hit;
25	import org.apache.lucene.search.Hits;
26	import org.apache.lucene.search.IndexSearcher;
27	import org.apache.lucene.search.Query;
28	import org.apache.lucene.search.QueryFilter;
29	import org.apache.lucene.search.RangeFilter;
30	import org.apache.lucene.search.Searcher;
31	import org.apache.lucene.search.Sort;
32
33
34	public class GS2LuceneQuery
35	{
36	// Use the standard set of English stop words by default
37	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
38
39
40	static public void main (String args[])
41	{
42	if (args.length == 0) {
43	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzzy] [-filter filter_string] [-sort sort_field] [-dco AND\|OR]");
44	return;
45	}
46
47	try {
48	Searcher searcher = new IndexSearcher(args[0]);
49	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
50
51	// Create one query parser with stop words, and one with none
52	QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer(stop_words));
53	QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
54
55	Sort sorter = new Sort();
56	QueryFilter filter = null;
57	boolean fuzzy = false;
58
59	// New code to allow the default conjunction operator to be
60	// definable
61	String default_conjuction_operator = "OR";
62	for (int i = 1; i < args.length; i++)
63	{
64	if (args[i].equals("-sort"))
65	{
66	i++;
67	sorter = new Sort(args[i]);
68	}
69	if (args[i].equals("-filter"))
70	{
71	i++;
72	try {
73	filter = new QueryFilter(query_parser.parse(args[i]));
74	}
75	catch (ParseException exception) {
76	exception.printStackTrace();
77	}
78	}
79	if (args[i].equals("-dco"))
80	{
81	i++;
82	default_conjuction_operator = args[i];
83	}
84	if (args[i].equals("-fuzzy"))
85	{
86	fuzzy = true;
87	}
88	}
89
90	// Lucene does "OR" queries by default; do an "AND" query if specified
91	if (default_conjuction_operator.equals("AND"))
92	{
93	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
94	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
95	}
96
97	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
98	while (true) {
99	// Read the query from STDIN
100	String query_string = in.readLine();
101	if (query_string == null \|\| query_string.length() == -1) {
102	break;
103	}
104	System.out.println("<ResultSet>");
105	System.out.println(" <QueryString>" + query_string + "</QueryString>");
106
107	try {
108	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
109	query_including_stop_words = query_including_stop_words.rewrite(reader);
110
111	Query query = parseQuery(reader, query_parser, query_string, fuzzy);
112	query = query.rewrite(reader);
113
114	// Perform the query
115	Hits hits;
116	if (filter != null) {
117	hits = searcher.search(query, filter, sorter);
118	}
119	else {
120	hits = searcher.search(query, sorter);
121	}
122
123	// Return the list of expanded query terms and their frequencies
124	HashMap term_counts = new HashMap();
125	HashMap term_fields = new HashMap();
126	HashSet terms = new HashSet();
127	query.extractTerms(terms);
128	Iterator iter = terms.iterator();
129	while (iter.hasNext())
130	{
131	Term term = (Term) iter.next();
132	// If you wanted to limit this to just TX terms add
133	// something like this:
134	//if (term.field().equals("TX"))
135	term_counts.put(term.text(), new Integer(0));
136	term_fields.put(term.text(), term.field());
137	}
138
139	// Do we need to use a hit iterator to get sorted results?
140	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
141	Iterator hit_iter = hits.iterator();
142	while (hit_iter.hasNext())
143	{
144	Hit hit = (Hit) hit_iter.next();
145	Document doc = hit.getDocument();
146	String node_id = doc.get("nodeID");
147	System.out.println(" <Match id=\"" + node_id + "\" />");
148
149	// From the document, extract the Term Vector for the
150	// TX field
151	TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
152	if (term_freq_vector != null && term_freq_vector.size() > 0)
153	{
154	int[] term_frequencies = term_freq_vector.getTermFrequencies();
155	// Now for each query term, determine the
156	// frequency - which may of course be 0.
157	Set term_counts_set = term_counts.keySet();
158	Iterator terms_iter = term_counts_set.iterator();
159	while (terms_iter.hasNext())
160	{
161	String term = (String) terms_iter.next();
162	Integer count_integer = (Integer) term_counts.get(term);
163	int count = count_integer.intValue();
164	int index = term_freq_vector.indexOf(term);
165	// If the term has a count, then add to
166	// the total count for this term
167	if (index != -1)
168	{
169	count += term_frequencies[index];
170
171	}
172	// Store the result
173	term_counts.put(term, new Integer(count));
174	count_integer = null;
175	term = null;
176	}
177	terms_iter = null;
178	term_counts_set = null;
179	}
180	else
181	{
182	///ystem.err.println("Error! Missing term vector for document " + hit.getId());
183	}
184	}
185
186	// Retrieve all the useful terms
187	Set term_counts_set = term_counts.keySet();
188	System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
189	// Iterate over them
190	Iterator terms_iter = term_counts_set.iterator();
191	while (terms_iter.hasNext())
192	{
193	String term = (String) terms_iter.next();
194	Integer count = (Integer) term_counts.get(term);
195	String field = (String) term_fields.get(term);
196	System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
197	count = null;
198	term = null;
199	}
200	// Cleanup
201	terms_iter = null;
202	term_counts_set = null;
203
204	// Return the list of stop words removed from the query
205	HashSet terms_including_stop_words = new HashSet();
206	query_including_stop_words.extractTerms(terms_including_stop_words);
207	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
208	while (terms_including_stop_words_iter.hasNext()) {
209	Term term = (Term) terms_including_stop_words_iter.next();
210	if (!terms.contains(term)) {
211	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
212	}
213	}
214	}
215	catch (ParseException parse_exception) {
216	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
217	}
218	catch (TooManyClauses too_many_clauses_exception) {
219	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
220	}
221
222	System.out.println("</ResultSet>");
223	}
224
225	searcher.close();
226	}
227	catch (IOException exception) {
228	exception.printStackTrace();
229	}
230	}
231
232
233	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, boolean fuzzy)
234	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
235	{
236	// Split query string into the search terms and the filter terms
237	// * The first +(...) term contains the search terms so count
238	// up '(' and stop when we finish matching ')'
239	int offset = 0;
240	int paren_count = 0;
241	boolean seen_paren = false;
242	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0))
243	{
244	if (query_string.charAt(offset) == '(')
245	{
246	paren_count++;
247	seen_paren = true;
248	}
249	if (query_string.charAt(offset) == ')')
250	{
251	paren_count--;
252	}
253	offset++;
254	}
255	String query_prefix = query_string.substring(0, offset);
256	String query_suffix = query_string.substring(offset);
257
258	///ystem.err.println("Prefix: " + query_prefix);
259	///ystem.err.println("Suffix: " + query_suffix);
260
261	Query query = query_parser.parse(query_prefix);
262	query = query.rewrite(reader);
263
264	// If this is a fuzzy search, then we need to add the fuzzy
265	// flag to each of the query terms
266	if (fuzzy && query.toString().length() > 0)
267	{
268	// Revert the query to a string
269	System.err.println("Rewritten query: " + query.toString());
270	// Search through the string for TX:<term> query terms
271	// and append the ~ operator. Not that this search will
272	// not change phrase searches (TX:"<term> <term>") as
273	// fuzzy searching is not possible for these entries.
274	// Yahoo! Time for a state machine!
275	StringBuffer mutable_query_string = new StringBuffer(query.toString());
276	int o = 0; // Offset
277	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
278	int s = 0; // State
279	while(o < mutable_query_string.length())
280	{
281	char c = mutable_query_string.charAt(o);
282	if (s == 0 && c == 'T')
283	{
284	///ystem.err.println("Found T!");
285	s = 1;
286	}
287	else if (s == 1)
288	{
289	if (c == 'X')
290	{
291	///ystem.err.println("Found X!");
292	s = 2;
293	}
294	else
295	{
296	s = 0; // Reset
297	}
298	}
299	else if (s == 2)
300	{
301	if (c == ':')
302	{
303	///ystem.err.println("Found TX:!");
304	s = 3;
305	}
306	else
307	{
308	s = 0; // Reset
309	}
310	}
311	else if (s == 3)
312	{
313	// Don't process phrases
314	if (c == '"')
315	{
316	///ystem.err.println("Stupid phrase...");
317	s = 0; // Reset
318	}
319	// Found the end of the term... add the
320	// fuzzy search indicator
321	// Nor outside the scope of parentheses
322	else if (Character.isWhitespace(c) \|\| c == ')')
323	{
324	///ystem.err.println("Yahoo! Found fuzzy term.");
325	mutable_query_string.insert(o, '~');
326	o++;
327	s = 0; // Reset
328	}
329	}
330	o++;
331	}
332	// If we were in the state of looking for the end of a
333	// term - then we just found it!
334	if (s == 3)
335	{
336	mutable_query_string.append('~');
337	}
338	// Reparse the query
339	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
340	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
341	}
342	else
343	{
344	query = query_parser.parse(query_prefix + query_suffix);
345	}
346
347	return query;
348	}
349	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: