source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@12976

Last change on this file since 12976 was 12976, checked in by mdewsnip, 18 years ago

Rearranged some code to make the fact that the term information is now independent of the search results clearer.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.7 KB
/**
 *
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @author [email protected]
 * @version
 */

package org.nzdl.gsdl.LuceneWrap;


import java.io.*;
import java.util.*;
import java.util.regex.*;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanQuery.TooManyClauses;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.RangeFilter;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Sort;


public class GS2LuceneQuery
{
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;


    static public void main (String args[])
    {
        if (args.length == 0) {
            System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
            return;
        }
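
        // Illustrative example of how this class might be invoked (the index path
        // and option values below are hypothetical, and the classpath is omitted):
        //   java org.nzdl.gsdl.LuceneWrap.GS2LuceneQuery /path/to/index -dco AND -startresults 1 -endresults 20
        // Queries are then read from standard input, one per line, and each one
        // produces a <ResultSet> block on standard output.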

        try {
            Searcher searcher = new IndexSearcher(args[0]);
            IndexReader reader = ((IndexSearcher) searcher).getIndexReader();

            // Create one query parser with the standard set of stop words, and one with none
            QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
            QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
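            // (The stop-word-free parser is only used further down to work out
            // which stop words were removed from the user's query.)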

            Sort sorter = null;
            Filter filter = null;
            String fuzziness = null;

            // Paging
            int start_results = 1;
            int end_results = -1;

            // New code to allow the default conjunction operator to be
            // definable
            String default_conjunction_operator = "OR";
            for (int i = 1; i < args.length; i++) {
                if (args[i].equals("-sort")) {
                    i++;
                    sorter = new Sort(args[i]);
                }
                if (args[i].equals("-filter")) {
                    i++;
                    filter = parseFilterString(args[i]);
                }
                if (args[i].equals("-dco")) {
                    i++;
                    default_conjunction_operator = args[i];
                }
                if (args[i].equals("-fuzziness")) {
                    i++;
                    fuzziness = args[i];
                }
                if (args[i].equals("-startresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        start_results = Integer.parseInt(args[i]);
                    }
                }
                if (args[i].equals("-endresults")) {
                    i++;
                    if (args[i].matches("\\d+")) {
                        end_results = Integer.parseInt(args[i]);
                    }
                }
            }

            // Lucene does "OR" queries by default; do an "AND" query if specified
            if (default_conjunction_operator.equals("AND")) {
                query_parser.setDefaultOperator(QueryParser.AND_OPERATOR);
                query_parser_no_stop_words.setDefaultOperator(QueryParser.AND_OPERATOR);
            }

            BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
            while (true) {
                // Read the query from STDIN
                String query_string = in.readLine();
                // Stop on end of input or an empty line
                if (query_string == null || query_string.length() == 0) {
                    break;
                }
114 System.out.println("<ResultSet>");
115 System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
116 if (filter != null) {
117 System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
118 }

                try {
                    Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
                    query_including_stop_words = query_including_stop_words.rewrite(reader);

                    Query query = parseQuery(reader, query_parser, query_string, fuzziness);
                    query = query.rewrite(reader);

                    // Return the list of expanded query terms and their frequencies
                    HashSet terms = new HashSet();
                    query.extractTerms(terms);
                    Iterator term_iterator = terms.iterator();
                    System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
                    while (term_iterator.hasNext()) {
                        Term term = (Term) term_iterator.next();

                        // Get the term frequency over all the documents
                        // (sum the within-document frequency of every document that
                        // contains the term; freq() is only valid after next())
                        TermDocs term_docs = reader.termDocs(term);
                        int term_freq = 0;
                        while (term_docs.next()) {
                            term_freq += term_docs.freq();
                        }

                        // If you wanted to limit this to just text terms add
                        // something like this:
                        // if (term.field().equals(TEXTFIELD))
                        System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
                    }

                    // Return the list of stop words removed from the query
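                    // (A term is reported as a stop word if it appears in the
                    // stop-word-free parse but not in the filtered query above.)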
                    HashSet terms_including_stop_words = new HashSet();
                    query_including_stop_words.extractTerms(terms_including_stop_words);
                    Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
                    while (terms_including_stop_words_iter.hasNext()) {
                        Term term = (Term) terms_including_stop_words_iter.next();
                        if (!terms.contains(term)) {
                            System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
                        }
                    }

                    // Perform the query (filter and sorter may be null)
                    Hits hits = searcher.search(query, filter, sorter);

                    // Do we need to use a hit iterator to get sorted results?
                    System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
                    System.out.println(" <StartResults num=\"" + start_results + "\" />");
                    System.out.println(" <EndsResults num=\"" + end_results + "\" />");

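                    // Results are numbered from 1; a hit is only printed when no
                    // paging was requested, or when it falls inside the inclusive
                    // [start_results, end_results] window.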
                    int counter = 1;
                    Iterator hit_iter = hits.iterator();
                    while (hit_iter.hasNext()) {
                        Hit hit = (Hit) hit_iter.next();
                        Document doc = hit.getDocument();

                        // May not be paging results
                        if (start_results == 1 && end_results == -1) {
                            String node_id = doc.get("nodeID");
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // Otherwise skip up until page offset
                        else if (start_results <= counter && counter <= end_results) {
                            String node_id = doc.get("nodeID");
                            System.out.println(" <Match id=\"" + node_id + "\" />");
                        }
                        // And skip all the rest

                        ++counter;
                    }
                }
                catch (ParseException parse_exception) {
                    System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
                }
                catch (TooManyClauses too_many_clauses_exception) {
                    System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
                }

                System.out.println("</ResultSet>");
            }

            searcher.close();
        }
        catch (IOException exception) {
            exception.printStackTrace();
        }
    }

    // Minimal XML escaping: only ampersands are escaped here
    private static String xmlSafe(String text) {
        return text.replaceAll("\\&", "\\&amp;");
    }

    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
        throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
        // Split query string into the search terms and the filter terms
        // * The first +(...) term contains the search terms so count
        //   up '(' and stop when we finish matching ')'
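        // For example, an incoming line such as "+(TX:(snail farming)) +CC:(CL1)"
        // (field names here are illustrative only) would be split into the prefix
        // "+(TX:(snail farming))" and the suffix " +CC:(CL1)".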
        int offset = 0;
        int paren_count = 0;
        boolean seen_paren = false;
        while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
            if (query_string.charAt(offset) == '(') {
                paren_count++;
                seen_paren = true;
            }
            if (query_string.charAt(offset) == ')') {
                paren_count--;
            }
            offset++;
        }
        String query_prefix = query_string.substring(0, offset);
        String query_suffix = query_string.substring(offset);

        ///ystem.err.println("Prefix: " + query_prefix);
        ///ystem.err.println("Suffix: " + query_suffix);

        Query query = query_parser.parse(query_prefix);
        query = query.rewrite(reader);

        // If this is a fuzzy search, then we need to add the fuzzy
        // flag to each of the query terms
        if (fuzziness != null && query.toString().length() > 0) {
            // Revert the query to a string
            System.err.println("Rewritten query: " + query.toString());
            // Search through the string for TX:<term> query terms
            // and append the ~ operator. Note that this search will
            // not change phrase searches (TX:"<term> <term>") as
            // fuzzy searching is not possible for these entries.
            // Yahoo! Time for a state machine!
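            // For example (terms and fuzziness value of 0.7 illustrative only),
            // a rewritten query "TX:dog TX:cat" would become
            // "TX:dog~0.7 TX:cat~0.7" before being reparsed below.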
            StringBuffer mutable_query_string = new StringBuffer(query.toString());
            int o = 0; // Offset
            // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
            int s = 0; // State
            while (o < mutable_query_string.length()) {
                char c = mutable_query_string.charAt(o);
                if (s == 0 && c == TEXTFIELD.charAt(0)) {
                    ///ystem.err.println("Found T!");
                    s = 1;
                }
                else if (s == 1) {
                    if (c == TEXTFIELD.charAt(1)) {
                        ///ystem.err.println("Found X!");
                        s = 2;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 2) {
                    if (c == ':') {
                        ///ystem.err.println("Found TX:!");
                        s = 3;
                    }
                    else {
                        s = 0; // Reset
                    }
                }
                else if (s == 3) {
                    // Don't process phrases
                    if (c == '"') {
                        ///ystem.err.println("Stupid phrase...");
                        s = 0; // Reset
                    }
                    // Found the end of the term (either whitespace or a
                    // closing parenthesis)... add the fuzzy search indicator
                    else if (Character.isWhitespace(c) || c == ')') {
                        ///ystem.err.println("Yahoo! Found fuzzy term.");
                        mutable_query_string.insert(o, '~' + fuzziness);
                        o++;
                        s = 0; // Reset
                    }
                }
                o++;
            }
            // If we were in the state of looking for the end of a
            // term - then we just found it!
            if (s == 3) {
                mutable_query_string.append('~' + fuzziness);
            }
            // Reparse the query
            ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
            query = query_parser.parse(mutable_query_string.toString() + query_suffix);
        }
        else {
            query = query_parser.parse(query_prefix + query_suffix);
        }

        return query;
    }


    /**
     * @todo Michael to comment
     */
    private static Filter parseFilterString(String filter_string)
    {
        Filter result = null;
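        // The filter string is expected to be a single-field numeric range,
        // e.g. "+CD:[20000101 TO 20061231]" (the field name and values here are
        // illustrative only); square brackets include the end points, curly
        // braces exclude them.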
        Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
        Matcher matcher = pattern.matcher(filter_string);
        if (matcher.matches()) {
            String field_name = matcher.group(1);
            boolean include_lower = matcher.group(2).equals("[");
            String lower_term = matcher.group(3);
            String upper_term = matcher.group(4);
            boolean include_upper = matcher.group(5).equals("]");
            result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
        }
        else {
            System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
        }
        return result;
    }
    /** parseFilterString() **/
}