Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12980

Last change on this file since 12980 was 12980, checked in by mdewsnip, 18 years ago
Now passes the endresults value (if defined) into the Searcher.search() call so only the required number of documents are returned.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.2 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermDocs;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.RangeFilter;
32	import org.apache.lucene.search.Searcher;
33	import org.apache.lucene.search.ScoreDoc;
34	import org.apache.lucene.search.Sort;
35	import org.apache.lucene.search.TopFieldDocs;
36
37
38	public class GS2LuceneQuery
39	{
40	static private String TEXTFIELD = "TX";
41
42	// Use the standard set of English stop words by default
43	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45
46	static public void main (String args[])
47	{
48	if (args.length == 0) {
49	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number]");
50	return;
51	}
52
53	try {
54	Searcher searcher = new IndexSearcher(args[0]);
55	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
56
57	// Create one query parser with the standard set of stop words, and one with none
58	QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
59	QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
60
61	Sort sorter = new Sort();
62	Filter filter = null;
63	String fuzziness = null;
64
65	// Paging
66	int start_results = 1;
67	int end_results = Integer.MAX_VALUE;
68
69	// New code to allow the default conjunction operator to be
70	// definable
71	String default_conjuction_operator = "OR";
72	for (int i = 1; i < args.length; i++) {
73	if (args[i].equals("-sort")) {
74	i++;
75	sorter = new Sort(args[i]);
76	}
77	if (args[i].equals("-filter")) {
78	i++;
79	filter = parseFilterString(args[i]);
80	}
81	if (args[i].equals("-dco")) {
82	i++;
83	default_conjuction_operator = args[i];
84	}
85	if (args[i].equals("-fuzziness")) {
86	i++;
87	fuzziness = args[i];
88	}
89	if (args[i].equals("-startresults")) {
90	i++;
91	if (args[i].matches("\\d+")) {
92	start_results = Integer.parseInt(args[i]);
93	}
94	}
95	if (args[i].equals("-endresults")) {
96	i++;
97	if (args[i].matches("\\d+")) {
98	end_results = Integer.parseInt(args[i]);
99	}
100	}
101	}
102
103	// Lucene does "OR" queries by default; do an "AND" query if specified
104	if (default_conjuction_operator.equals("AND")) {
105	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
106	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
107	}
108
109	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
110	while (true) {
111	// Read the query from STDIN
112	String query_string = in.readLine();
113	if (query_string == null \|\| query_string.length() == -1) {
114	break;
115	}
116	System.out.println("<ResultSet>");
117	System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
118	if (filter != null) {
119	System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
120	}
121
122	try {
123	Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
124	query_including_stop_words = query_including_stop_words.rewrite(reader);
125
126	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
127	query = query.rewrite(reader);
128
129	// Return the list of expanded query terms and their frequencies
130	HashSet terms = new HashSet();
131	query.extractTerms(terms);
132	Iterator term_iterator = terms.iterator();
133	System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
134	while (term_iterator.hasNext()) {
135	Term term = (Term) term_iterator.next();
136
137	// Get the term frequency over all the documents
138	TermDocs term_docs = reader.termDocs(term);
139	int term_freq = term_docs.freq();
140	while (term_docs.next()) {
141	term_freq += term_docs.freq();
142	}
143
144	// If you wanted to limit this to just text terms add
145	// something like this:
146	// if (term.field().equals(TEXTFIELD))
147	System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
148	}
149
150	// Return the list of stop words removed from the query
151	HashSet terms_including_stop_words = new HashSet();
152	query_including_stop_words.extractTerms(terms_including_stop_words);
153	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
154	while (terms_including_stop_words_iter.hasNext()) {
155	Term term = (Term) terms_including_stop_words_iter.next();
156	if (!terms.contains(term)) {
157	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
158	}
159	}
160
161	// Simple case for getting all the matching documents
162	if (end_results == Integer.MAX_VALUE) {
163	// Perform the query (filter and sorter may be null)
164	Hits hits = searcher.search(query, filter, sorter);
165	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
166
167	// Output the matching documents
168	System.out.println(" <StartResults num=\"" + start_results + "\" />");
169	System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
170	for (int i = start_results; i <= hits.length(); i++) {
171	Document doc = hits.doc(i - 1);
172	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
173	}
174	}
175
176	// Slightly more complicated case for returning a subset of the matching documents
177	else {
178	// Perform the query (filter may be null)
179	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
180	System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
181
182	// Output the matching documents
183	System.out.println(" <StartResults num=\"" + start_results + "\" />");
184	System.out.println(" <EndsResults num=\"" + end_results + "\" />");
185	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
186	Document doc = reader.document(hits.scoreDocs[i - 1].doc);
187	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
188	}
189	}
190	}
191	catch (ParseException parse_exception) {
192	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
193	}
194	catch (TooManyClauses too_many_clauses_exception) {
195	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
196	}
197
198	System.out.println("</ResultSet>");
199	}
200
201	searcher.close();
202	}
203	catch (IOException exception) {
204	exception.printStackTrace();
205	}
206	}
207
208	private static String xmlSafe(String text) {
209	return text.replaceAll("\\&", "\\&");
210	}
211
212	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
213	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
214	{
215	// Split query string into the search terms and the filter terms
216	// * The first +(...) term contains the search terms so count
217	// up '(' and stop when we finish matching ')'
218	int offset = 0;
219	int paren_count = 0;
220	boolean seen_paren = false;
221	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
222	if (query_string.charAt(offset) == '(') {
223	paren_count++;
224	seen_paren = true;
225	}
226	if (query_string.charAt(offset) == ')') {
227	paren_count--;
228	}
229	offset++;
230	}
231	String query_prefix = query_string.substring(0, offset);
232	String query_suffix = query_string.substring(offset);
233
234	///ystem.err.println("Prefix: " + query_prefix);
235	///ystem.err.println("Suffix: " + query_suffix);
236
237	Query query = query_parser.parse(query_prefix);
238	query = query.rewrite(reader);
239
240	// If this is a fuzzy search, then we need to add the fuzzy
241	// flag to each of the query terms
242	if (fuzziness != null && query.toString().length() > 0) {
243	// Revert the query to a string
244	System.err.println("Rewritten query: " + query.toString());
245	// Search through the string for TX:<term> query terms
246	// and append the ~ operator. Not that this search will
247	// not change phrase searches (TX:"<term> <term>") as
248	// fuzzy searching is not possible for these entries.
249	// Yahoo! Time for a state machine!
250	StringBuffer mutable_query_string = new StringBuffer(query.toString());
251	int o = 0; // Offset
252	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
253	int s = 0; // State
254	while (o < mutable_query_string.length()) {
255	char c = mutable_query_string.charAt(o);
256	if (s == 0 && c == TEXTFIELD.charAt(0)) {
257	///ystem.err.println("Found T!");
258	s = 1;
259	}
260	else if (s == 1) {
261	if (c == TEXTFIELD.charAt(1)) {
262	///ystem.err.println("Found X!");
263	s = 2;
264	}
265	else {
266	s = 0; // Reset
267	}
268	}
269	else if (s == 2) {
270	if (c == ':') {
271	///ystem.err.println("Found TX:!");
272	s = 3;
273	}
274	else {
275	s = 0; // Reset
276	}
277	}
278	else if (s == 3) {
279	// Don't process phrases
280	if (c == '"') {
281	///ystem.err.println("Stupid phrase...");
282	s = 0; // Reset
283	}
284	// Found the end of the term... add the
285	// fuzzy search indicator
286	// Nor outside the scope of parentheses
287	else if (Character.isWhitespace(c) \|\| c == ')') {
288	///ystem.err.println("Yahoo! Found fuzzy term.");
289	mutable_query_string.insert(o, '~' + fuzziness);
290	o++;
291	s = 0; // Reset
292	}
293	}
294	o++;
295	}
296	// If we were in the state of looking for the end of a
297	// term - then we just found it!
298	if (s == 3) {
299	mutable_query_string.append('~' + fuzziness);
300	}
301	// Reparse the query
302	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
303	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
304	}
305	else {
306	query = query_parser.parse(query_prefix + query_suffix);
307	}
308
309	return query;
310	}
311
312
313	/**
314	* @todo Michael to comment
315	*/
316	private static Filter parseFilterString(String filter_string)
317	{
318	Filter result = null;
319	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
320	Matcher matcher = pattern.matcher(filter_string);
321	if (matcher.matches()) {
322	String field_name = matcher.group(1);
323	boolean include_lower = matcher.group(2).equals("[");
324	String lower_term = matcher.group(3);
325	String upper_term = matcher.group(4);
326	boolean include_upper = matcher.group(5).equals("]");
327	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
328	}
329	else {
330	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
331	}
332	return result;
333	}
334	/ parseFilterString() /
335	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: