Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12390

Last change on this file since 12390 was 12390, checked in by mdewsnip, 18 years ago
More fixes, many thanks to John Thompson and DL Consulting Ltd.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.6 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @version
6	*/
7
8	package org.nzdl.gsdl.LuceneWrap;
9
10
11	import java.io.BufferedReader;
12	import java.io.InputStreamReader;
13	import java.util.Collections;
14	import java.util.HashMap;
15	import java.util.HashSet;
16	import java.util.Iterator;
17	import java.util.Set;
18
19	import org.apache.lucene.analysis.Analyzer;
20	import org.apache.lucene.analysis.standard.StandardAnalyzer;
21	import org.apache.lucene.document.Document;
22	import org.apache.lucene.index.IndexReader;
23	import org.apache.lucene.index.Term;
24	import org.apache.lucene.index.TermFreqVector;
25	import org.apache.lucene.queryParser.QueryParser;
26	import org.apache.lucene.search.Hit;
27	import org.apache.lucene.search.Hits;
28	import org.apache.lucene.search.IndexSearcher;
29	import org.apache.lucene.search.Query;
30	import org.apache.lucene.search.Searcher;
31	import org.apache.lucene.search.Sort;
32
33
34	public class GS2LuceneQuery
35	{
36	public static void main (String args[])
37	{
38	if (args.length == 0) {
39	System.out.println("Usage: GS2LuceneQuery <index directory> (<sort field>)");
40	return;
41	}
42
43	try {
44	Searcher searcher = new IndexSearcher(args[0]);
45	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
46
47	Sort sorter = new Sort();
48	boolean fuzzy = false;
49
50	// New code to allow the default conjunction operator to be
51	// definable
52	String default_conjuction_operator = "OR";
53	for (int i = 1; i < args.length; i++)
54	{
55	if (args[i].equals("-sort"))
56	{
57	i++;
58	///ystem.err.println("**** sort by = " + args[i]);
59	sorter = new Sort(args[i]);
60	}
61	if (args[i].equals("-dco"))
62	{
63	i++;
64	default_conjuction_operator = args[i];
65	}
66	if (args[i].equals("-fuzzy"))
67	{
68	fuzzy = true;
69	}
70	}
71
72	// Create one query parser with the standard set of stop words, and one with none
73	QueryParser query_parser = new QueryParser("TX", new StandardAnalyzer());
74	QueryParser query_parser_no_stop_words = new QueryParser("TX", new StandardAnalyzer(new String[] { }));
75
76	// Lucene does "OR" queries by default; do an "AND" query if specified
77	if (default_conjuction_operator.equals("AND"))
78	{
79	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
80	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
81	}
82
83	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
84	while (true)
85	{
86	// Read the query from STDIN
87	String query_string = in.readLine();
88	if (query_string == null \|\| query_string.length() == -1)
89	{
90	break;
91	}
92	///ystem.err.println("**** query = " + query_string);
93
94	Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
95	query_including_stop_words = query_including_stop_words.rewrite(reader);
96
97	// Split query string into the search terms and the filter terms
98	// * The first +(...) term contains the search terms so count
99	// up '(' and stop when we finish matching ')'
100	int offset = 0;
101	int paren_count = 0;
102	boolean seen_paren = false;
103	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0))
104	{
105	if (query_string.charAt(offset) == '(')
106	{
107	paren_count++;
108	seen_paren = true;
109	}
110	if (query_string.charAt(offset) == ')')
111	{
112	paren_count--;
113	}
114	offset++;
115	}
116	String query_prefix = query_string.substring(0, offset);
117	String query_suffix = query_string.substring(offset);
118
119	///ystem.err.println("Prefix: " + query_prefix);
120	///ystem.err.println("Suffix: " + query_suffix);
121
122	Query query = query_parser.parse(query_prefix);
123	query = query.rewrite(reader);
124
125	// If this is a fuzzy search, then we need to add the fuzzy
126	// flag to each of the query terms
127	if (fuzzy && query.toString().length() > 0)
128	{
129	// Revert the query to a string
130	///ystem.err.println("Rewritten query: " + query.toString());
131	// Search through the string for TX:<term> query terms
132	// and append the ~ operator. Not that this search will
133	// not change phrase searches (TX:"<term> <term>") as
134	// fuzzy searching is not possible for these entries.
135	// Yahoo! Time for a state machine!
136	StringBuffer mutable_query_string = new StringBuffer(query.toString());
137	int o = 0; // Offset
138	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
139	int s = 0; // State
140	while(o < mutable_query_string.length())
141	{
142	char c = mutable_query_string.charAt(o);
143	if (s == 0 && c == 'T')
144	{
145	///ystem.err.println("Found T!");
146	s = 1;
147	}
148	else if (s == 1)
149	{
150	if (c == 'X')
151	{
152	///ystem.err.println("Found X!");
153	s = 2;
154	}
155	else
156	{
157	s = 0; // Reset
158	}
159	}
160	else if (s == 2)
161	{
162	if (c == ':')
163	{
164	///ystem.err.println("Found TX:!");
165	s = 3;
166	}
167	else
168	{
169	s = 0; // Reset
170	}
171	}
172	else if (s == 3)
173	{
174	// Don't process phrases
175	if (c == '"')
176	{
177	///ystem.err.println("Stupid phrase...");
178	s = 0; // Reset
179	}
180	// Found the end of the term... add the
181	// fuzzy search indicator
182	// Nor outside the scope of parentheses
183	else if (Character.isWhitespace(c) \|\| c == ')')
184	{
185	///ystem.err.println("Yahoo! Found fuzzy term.");
186	mutable_query_string.insert(o, '~');
187	o++;
188	s = 0; // Reset
189	}
190	}
191	o++;
192	}
193	// If we were in the state of looking for the end of a
194	// term - then we just found it!
195	if (s == 3)
196	{
197	mutable_query_string.append('~');
198	}
199	// Reparse the query
200	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
201	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
202	// And rewrite again
203	query = query.rewrite(reader);
204	///ystem.err.println("Rewritten Fuzzy query: " + query.toString());
205	}
206	else
207	{
208	query = query_parser.parse(query_prefix + query_suffix);
209	query = query.rewrite(reader);
210	}
211
212	// Perform the query
213	Hits hits = searcher.search(query, sorter);
214	System.out.println("<ResultSet>");
215	System.out.println(" <QueryString>" + query_string + "</QueryString>");
216	// Return the list of expanded query terms and their frequencies
217	HashMap term_counts = new HashMap();
218	HashMap term_fields = new HashMap();
219	HashSet terms = new HashSet();
220	query.extractTerms(terms);
221	//System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
222	Iterator iter = terms.iterator();
223	while (iter.hasNext())
224	{
225	Term term = (Term) iter.next();
226	// If you wanted to limit this to just TX terms add
227	// something like this:
228	//if (term.field().equals("TX"))
229	term_counts.put(term.text(), new Integer(0));
230	term_fields.put(term.text(), term.field());
231	}
232
233	// Do we need to use a hit iterator to get sorted results?
234	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
235	Iterator hit_iter = hits.iterator();
236	while (hit_iter.hasNext())
237	{
238	Hit hit = (Hit) hit_iter.next();
239	Document doc = hit.getDocument();
240	String node_id = doc.get("nodeID");
241	System.out.println(" <Match id=\"" + node_id + "\" />");
242
243	// From the document, extract the Term Vector for the
244	// TX field
245	TermFreqVector term_freq_vector = reader.getTermFreqVector(hit.getId(), "TX");
246	if (term_freq_vector.size() > 0)
247	{
248	int[] term_frequencies = term_freq_vector.getTermFrequencies();
249	// Now for each query term, determine the
250	// frequency - which may of course be 0.
251	Set term_counts_set = term_counts.keySet();
252	Iterator terms_iter = term_counts_set.iterator();
253	while (terms_iter.hasNext())
254	{
255	String term = (String) terms_iter.next();
256	Integer count_integer = (Integer) term_counts.get(term);
257	int count = count_integer.intValue();
258	int index = term_freq_vector.indexOf(term);
259	// If the term has a count, then add to
260	// the total count for this term
261	if (index != -1)
262	{
263	count += term_frequencies[index];
264
265	}
266	// Store the result
267	term_counts.put(term, new Integer(count));
268	count_integer = null;
269	term = null;
270	}
271	terms_iter = null;
272	term_counts_set = null;
273	}
274	else
275	{
276	///ystem.err.println("Error! Missing term vector for document " + hit.getId());
277	}
278	}
279
280	// Retrieve all the useful terms
281	Set term_counts_set = term_counts.keySet();
282	System.out.println(" <QueryTermsInfo num=\"" + term_counts_set.size() + "\"/>");
283	// Iterate over them
284	Iterator terms_iter = term_counts_set.iterator();
285	while (terms_iter.hasNext())
286	{
287	String term = (String) terms_iter.next();
288	Integer count = (Integer) term_counts.get(term);
289	String field = (String) term_fields.get(term);
290	System.out.println(" <Term value=\"" + term + "\" field=\"" + field + "\" freq=\"" + count.intValue() + "\" />");
291	count = null;
292	term = null;
293	}
294	// Cleanup
295	terms_iter = null;
296	term_counts_set = null;
297
298	// Return the list of stop words removed from the query
299	HashSet terms_including_stop_words = new HashSet();
300	query_including_stop_words.extractTerms(terms_including_stop_words);
301	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
302	while (terms_including_stop_words_iter.hasNext()) {
303	Term term = (Term) terms_including_stop_words_iter.next();
304	if (!terms.contains(term)) {
305	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
306	}
307	}
308
309	System.out.println("</ResultSet>");
310	}
311
312	searcher.close();
313	}
314	catch (Exception exception) {
315	exception.printStackTrace();
316	}
317	}
318	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: