source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12989

Last change on this file since 12989 was 12989, checked in by mdewsnip, 18 years ago

Follow-up change to close the searcher object.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
38public class GS2LuceneQuery
39{
40 static private String TEXTFIELD = "TX";
41
42 // Use the standard set of English stop words by default
43 static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45 // Command-line options
46 static private String fuzziness = null;
47 static private Filter filter = null;
48 static private Sort sorter = new Sort();
49 static private String default_conjuction_operator = "OR";
50 static private int start_results = 1;
51 static private int end_results = Integer.MAX_VALUE;
52
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 String query_string = null;
70
71 // Parse the command-line arguments
72 for (int i = 1; i < args.length; i++) {
73 if (args[i].equals("-sort")) {
74 i++;
75 sorter = new Sort(args[i]);
76 }
77 else if (args[i].equals("-filter")) {
78 i++;
79 filter = parseFilterString(args[i]);
80 }
81 else if (args[i].equals("-dco")) {
82 i++;
83 default_conjuction_operator = args[i];
84 }
85 else if (args[i].equals("-fuzziness")) {
86 i++;
87 fuzziness = args[i];
88 }
89 else if (args[i].equals("-startresults")) {
90 i++;
91 if (args[i].matches("\\d+")) {
92 start_results = Integer.parseInt(args[i]);
93 }
94 }
95 else if (args[i].equals("-endresults")) {
96 i++;
97 if (args[i].matches("\\d+")) {
98 end_results = Integer.parseInt(args[i]);
99 }
100 }
101 else {
102 query_string = args[i];
103 }
104 }
105
106 // Lucene does "OR" queries by default; do an "AND" query if specified
107 if (default_conjuction_operator.equals("AND")) {
108 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110 }
111
112 // The query string has been specified as a command-line argument
113 if (query_string != null) {
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116
117 // Read queries from STDIN
118 else {
119 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120 while (true) {
121 // Read the query from STDIN
122 query_string = in.readLine();
123 if (query_string == null || query_string.length() == -1) {
124 break;
125 }
126
127 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128 }
129 }
130
131 searcher.close();
132 }
133 catch (IOException exception) {
134 exception.printStackTrace();
135 }
136 }
137
138
139 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
140 throws IOException
141 {
142 try {
143 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
144 query_including_stop_words = query_including_stop_words.rewrite(reader);
145
146 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
147 query = query.rewrite(reader);
148
149 // Return the list of expanded query terms and their frequencies
150 HashSet terms = new HashSet();
151 query.extractTerms(terms);
152 Iterator term_iterator = terms.iterator();
153 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
154 while (term_iterator.hasNext()) {
155 Term term = (Term) term_iterator.next();
156
157 // Get the term frequency over all the documents
158 TermDocs term_docs = reader.termDocs(term);
159 int term_freq = term_docs.freq();
160 while (term_docs.next()) {
161 term_freq += term_docs.freq();
162 }
163
164 // If you wanted to limit this to just text terms add
165 // something like this:
166 // if (term.field().equals(TEXTFIELD))
167 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
168 }
169
170 // Return the list of stop words removed from the query
171 HashSet terms_including_stop_words = new HashSet();
172 query_including_stop_words.extractTerms(terms_including_stop_words);
173 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
174 while (terms_including_stop_words_iter.hasNext()) {
175 Term term = (Term) terms_including_stop_words_iter.next();
176 if (!terms.contains(term)) {
177 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
178 }
179 }
180
181 // Simple case for getting all the matching documents
182 if (end_results == Integer.MAX_VALUE) {
183 // Perform the query (filter and sorter may be null)
184 Hits hits = searcher.search(query, filter, sorter);
185 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
186
187 // Output the matching documents
188 System.out.println(" <StartResults num=\"" + start_results + "\" />");
189 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
190 for (int i = start_results; i <= hits.length(); i++) {
191 Document doc = hits.doc(i - 1);
192 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
193 }
194 }
195
196 // Slightly more complicated case for returning a subset of the matching documents
197 else {
198 // Perform the query (filter may be null)
199 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
200 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
201
202 // Output the matching documents
203 System.out.println(" <StartResults num=\"" + start_results + "\" />");
204 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
205 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
206 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
207 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
208 }
209 }
210 }
211 catch (ParseException parse_exception) {
212 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
213 }
214 catch (TooManyClauses too_many_clauses_exception) {
215 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
216 }
217
218 System.out.println("</ResultSet>");
219 }
220
221
222 private static String xmlSafe(String text) {
223 return text.replaceAll("\\&", "\\&amp;");
224 }
225
226 private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
227 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
228 {
229 // Split query string into the search terms and the filter terms
230 // * The first +(...) term contains the search terms so count
231 // up '(' and stop when we finish matching ')'
232 int offset = 0;
233 int paren_count = 0;
234 boolean seen_paren = false;
235 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
236 if (query_string.charAt(offset) == '(') {
237 paren_count++;
238 seen_paren = true;
239 }
240 if (query_string.charAt(offset) == ')') {
241 paren_count--;
242 }
243 offset++;
244 }
245 String query_prefix = query_string.substring(0, offset);
246 String query_suffix = query_string.substring(offset);
247
248 ///ystem.err.println("Prefix: " + query_prefix);
249 ///ystem.err.println("Suffix: " + query_suffix);
250
251 Query query = query_parser.parse(query_prefix);
252 query = query.rewrite(reader);
253
254 // If this is a fuzzy search, then we need to add the fuzzy
255 // flag to each of the query terms
256 if (fuzziness != null && query.toString().length() > 0) {
257 // Revert the query to a string
258 System.err.println("Rewritten query: " + query.toString());
259 // Search through the string for TX:<term> query terms
260 // and append the ~ operator. Not that this search will
261 // not change phrase searches (TX:"<term> <term>") as
262 // fuzzy searching is not possible for these entries.
263 // Yahoo! Time for a state machine!
264 StringBuffer mutable_query_string = new StringBuffer(query.toString());
265 int o = 0; // Offset
266 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
267 int s = 0; // State
268 while (o < mutable_query_string.length()) {
269 char c = mutable_query_string.charAt(o);
270 if (s == 0 && c == TEXTFIELD.charAt(0)) {
271 ///ystem.err.println("Found T!");
272 s = 1;
273 }
274 else if (s == 1) {
275 if (c == TEXTFIELD.charAt(1)) {
276 ///ystem.err.println("Found X!");
277 s = 2;
278 }
279 else {
280 s = 0; // Reset
281 }
282 }
283 else if (s == 2) {
284 if (c == ':') {
285 ///ystem.err.println("Found TX:!");
286 s = 3;
287 }
288 else {
289 s = 0; // Reset
290 }
291 }
292 else if (s == 3) {
293 // Don't process phrases
294 if (c == '"') {
295 ///ystem.err.println("Stupid phrase...");
296 s = 0; // Reset
297 }
298 // Found the end of the term... add the
299 // fuzzy search indicator
300 // Nor outside the scope of parentheses
301 else if (Character.isWhitespace(c) || c == ')') {
302 ///ystem.err.println("Yahoo! Found fuzzy term.");
303 mutable_query_string.insert(o, '~' + fuzziness);
304 o++;
305 s = 0; // Reset
306 }
307 }
308 o++;
309 }
310 // If we were in the state of looking for the end of a
311 // term - then we just found it!
312 if (s == 3) {
313 mutable_query_string.append('~' + fuzziness);
314 }
315 // Reparse the query
316 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
317 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
318 }
319 else {
320 query = query_parser.parse(query_prefix + query_suffix);
321 }
322
323 return query;
324 }
325
326
327 /**
328 * @todo Michael to comment
329 */
330 private static Filter parseFilterString(String filter_string)
331 {
332 Filter result = null;
333 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
334 Matcher matcher = pattern.matcher(filter_string);
335 if (matcher.matches()) {
336 String field_name = matcher.group(1);
337 boolean include_lower = matcher.group(2).equals("[");
338 String lower_term = matcher.group(3);
339 String upper_term = matcher.group(4);
340 boolean include_upper = matcher.group(5).equals("]");
341 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
342 }
343 else {
344 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
345 }
346 return result;
347 }
348 /** parseFilterString() **/
349}
Note: See TracBrowser for help on using the repository browser.