source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12983

Last change on this file since 12983 was 12983, checked in by mdewsnip, 18 years ago

Moved the stuff for running the query into a new runQuery function, in preparation for allowing the query string to be specified as a command-line argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 11.1 KB
Line 
1/**
2 *
3 * @author [email protected]
4 * @author [email protected]
5 * @author [email protected]
6 * @author [email protected]
7 * @version
8 */
9
10package org.nzdl.gsdl.LuceneWrap;
11
12
13import java.io.*;
14import java.util.*;
15import java.util.regex.*;
16
17import org.apache.lucene.analysis.Analyzer;
18import org.apache.lucene.analysis.standard.StandardAnalyzer;
19import org.apache.lucene.document.Document;
20import org.apache.lucene.index.IndexReader;
21import org.apache.lucene.index.Term;
22import org.apache.lucene.index.TermDocs;
23import org.apache.lucene.queryParser.ParseException;
24import org.apache.lucene.queryParser.QueryParser;
25import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26import org.apache.lucene.search.Filter;
27import org.apache.lucene.search.Hit;
28import org.apache.lucene.search.Hits;
29import org.apache.lucene.search.IndexSearcher;
30import org.apache.lucene.search.Query;
31import org.apache.lucene.search.RangeFilter;
32import org.apache.lucene.search.Searcher;
33import org.apache.lucene.search.ScoreDoc;
34import org.apache.lucene.search.Sort;
35import org.apache.lucene.search.TopFieldDocs;
36
37
public class GS2LuceneQuery
{
    // Name of the Lucene document field holding the full text ("TX");
    // this is the default field for both query parsers
    static private String TEXTFIELD = "TX";

    // Use the standard set of English stop words by default
    static private String[] stop_words = StandardAnalyzer.STOP_WORDS;

    // Command-line options: set once in main(), then read by runQuery()/parseQuery()

    // Fuzzy factor appended to each plain text term as "term~<fuzziness>";
    // null means exact (non-fuzzy) matching
    static private String fuzziness = null;
    // Optional range filter applied to every search; null means no filtering
    static private Filter filter = null;
    // Result ordering; the no-argument Sort() sorts by relevance
    static private Sort sorter = new Sort();
    // How bare query terms are combined: "OR" (Lucene's default) or "AND"
    // NOTE(review): field name is misspelled ("conjuction") but kept as-is
    // since every method in this file refers to it by this name
    static private String default_conjuction_operator = "OR";
    // 1-based window of matching documents to report on stdout
    static private int start_results = 1;
    static private int end_results = Integer.MAX_VALUE;
53
54 static public void main (String args[])
55 {
56 if (args.length == 0) {
57 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number]");
58 return;
59 }
60
61 try {
62 Searcher searcher = new IndexSearcher(args[0]);
63 IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65 // Create one query parser with the standard set of stop words, and one with none
66 QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67 QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69 for (int i = 1; i < args.length; i++) {
70 if (args[i].equals("-sort")) {
71 i++;
72 sorter = new Sort(args[i]);
73 }
74 else if (args[i].equals("-filter")) {
75 i++;
76 filter = parseFilterString(args[i]);
77 }
78 else if (args[i].equals("-dco")) {
79 i++;
80 default_conjuction_operator = args[i];
81 }
82 else if (args[i].equals("-fuzziness")) {
83 i++;
84 fuzziness = args[i];
85 }
86 else if (args[i].equals("-startresults")) {
87 i++;
88 if (args[i].matches("\\d+")) {
89 start_results = Integer.parseInt(args[i]);
90 }
91 }
92 else if (args[i].equals("-endresults")) {
93 i++;
94 if (args[i].matches("\\d+")) {
95 end_results = Integer.parseInt(args[i]);
96 }
97 }
98 }
99
100 // Lucene does "OR" queries by default; do an "AND" query if specified
101 if (default_conjuction_operator.equals("AND")) {
102 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
103 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
104 }
105
106 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
107 while (true) {
108 // Read the query from STDIN
109 String query_string = in.readLine();
110 if (query_string == null || query_string.length() == -1) {
111 break;
112 }
113
114 runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115 }
116 }
117 catch (IOException exception) {
118 exception.printStackTrace();
119 }
120 }
121
122
123 private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
124 throws IOException
125 {
126 try {
127 Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
128 query_including_stop_words = query_including_stop_words.rewrite(reader);
129
130 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
131 query = query.rewrite(reader);
132
133 // Return the list of expanded query terms and their frequencies
134 HashSet terms = new HashSet();
135 query.extractTerms(terms);
136 Iterator term_iterator = terms.iterator();
137 System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
138 while (term_iterator.hasNext()) {
139 Term term = (Term) term_iterator.next();
140
141 // Get the term frequency over all the documents
142 TermDocs term_docs = reader.termDocs(term);
143 int term_freq = term_docs.freq();
144 while (term_docs.next()) {
145 term_freq += term_docs.freq();
146 }
147
148 // If you wanted to limit this to just text terms add
149 // something like this:
150 // if (term.field().equals(TEXTFIELD))
151 System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
152 }
153
154 // Return the list of stop words removed from the query
155 HashSet terms_including_stop_words = new HashSet();
156 query_including_stop_words.extractTerms(terms_including_stop_words);
157 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
158 while (terms_including_stop_words_iter.hasNext()) {
159 Term term = (Term) terms_including_stop_words_iter.next();
160 if (!terms.contains(term)) {
161 System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
162 }
163 }
164
165 // Simple case for getting all the matching documents
166 if (end_results == Integer.MAX_VALUE) {
167 // Perform the query (filter and sorter may be null)
168 Hits hits = searcher.search(query, filter, sorter);
169 System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
170
171 // Output the matching documents
172 System.out.println(" <StartResults num=\"" + start_results + "\" />");
173 System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
174 for (int i = start_results; i <= hits.length(); i++) {
175 Document doc = hits.doc(i - 1);
176 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
177 }
178 }
179
180 // Slightly more complicated case for returning a subset of the matching documents
181 else {
182 // Perform the query (filter may be null)
183 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
184 System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
185
186 // Output the matching documents
187 System.out.println(" <StartResults num=\"" + start_results + "\" />");
188 System.out.println(" <EndsResults num=\"" + end_results + "\" />");
189 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
190 Document doc = reader.document(hits.scoreDocs[i - 1].doc);
191 System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
192 }
193 }
194 }
195 catch (ParseException parse_exception) {
196 System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
197 }
198 catch (TooManyClauses too_many_clauses_exception) {
199 System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
200 }
201
202 System.out.println("</ResultSet>");
203 }
204
205
206 private static String xmlSafe(String text) {
207 return text.replaceAll("\\&", "\\&amp;");
208 }
209
    /**
     * Parses a raw query string into a Lucene Query, optionally rewriting
     * each plain text term into a fuzzy term ("term~&lt;fuzziness&gt;").
     *
     * The query string is split into a prefix (the first parenthesised
     * group, assumed to hold the search terms) and a suffix (everything
     * after it, assumed to hold filter terms). Only the prefix is subject
     * to fuzzy rewriting; the suffix is re-appended verbatim before the
     * final parse.
     *
     * @param reader       index reader used to rewrite the parsed query
     * @param query_parser parser (with or without stop words) to use
     * @param query_string the raw query, e.g. "+(TX:word) +FIELD:[a TO b]"
     * @param fuzziness    fuzzy factor to append to each term, or null for
     *                     exact matching
     * @return the parsed (and, if fuzzy, re-parsed) query
     * @throws java.io.IOException from Query.rewrite()
     * @throws org.apache.lucene.queryParser.ParseException on bad syntax
     */
    private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
    {
	// Split query string into the search terms and the filter terms
	// * The first +(...) term contains the search terms so count
	// up '(' and stop when we finish matching ')'
	// If the string contains no parentheses at all, the whole string
	// becomes the prefix and the suffix is empty.
	int offset = 0;
	int paren_count = 0;
	boolean seen_paren = false;
	while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
	    if (query_string.charAt(offset) == '(') {
		paren_count++;
		seen_paren = true;
	    }
	    if (query_string.charAt(offset) == ')') {
		paren_count--;
	    }
	    offset++;
	}
	String query_prefix = query_string.substring(0, offset);
	String query_suffix = query_string.substring(offset);

	///ystem.err.println("Prefix: " + query_prefix);
	///ystem.err.println("Suffix: " + query_suffix);

	// Parse and rewrite the prefix so the fuzzy pass below sees the
	// expanded (normalised) form of each term
	Query query = query_parser.parse(query_prefix);
	query = query.rewrite(reader);

	// If this is a fuzzy search, then we need to add the fuzzy
	// flag to each of the query terms
	if (fuzziness != null && query.toString().length() > 0) {
	    // Revert the query to a string
	    System.err.println("Rewritten query: " + query.toString());
	    // Search through the string for TX:<term> query terms
	    // and append the ~ operator. Note that this search will
	    // not change phrase searches (TX:"<term> <term>") as
	    // fuzzy searching is not possible for these entries.
	    // Yahoo! Time for a state machine!
	    StringBuffer mutable_query_string = new StringBuffer(query.toString());
	    int o = 0; // Offset
	    // States: 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
	    // (i.e. the machine matches the literal field prefix "TX:",
	    // built from TEXTFIELD, then scans for the end of the term)
	    int s = 0; // State
	    while (o < mutable_query_string.length()) {
		char c = mutable_query_string.charAt(o);
		if (s == 0 && c == TEXTFIELD.charAt(0)) {
		    ///ystem.err.println("Found T!");
		    s = 1;
		}
		else if (s == 1) {
		    if (c == TEXTFIELD.charAt(1)) {
			///ystem.err.println("Found X!");
			s = 2;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 2) {
		    if (c == ':') {
			///ystem.err.println("Found TX:!");
			s = 3;
		    }
		    else {
			s = 0; // Reset
		    }
		}
		else if (s == 3) {
		    // Don't process phrases
		    if (c == '"') {
			///ystem.err.println("Stupid phrase...");
			s = 0; // Reset
		    }
		    // Found the end of the term... add the
		    // fuzzy search indicator
		    // Nor outside the scope of parentheses
		    else if (Character.isWhitespace(c) || c == ')') {
			///ystem.err.println("Yahoo! Found fuzzy term.");
			// '~' + fuzziness is String concatenation (char + String),
			// e.g. "~0.7". Only o++ is needed afterwards: scanning
			// resumes inside the inserted text, but its digits
			// cannot match 'T' so the machine stays in BASE.
			mutable_query_string.insert(o, '~' + fuzziness);
			o++;
			s = 0; // Reset
		    }
		}
		o++;
	    }
	    // If we were in the state of looking for the end of a
	    // term - then we just found it!
	    if (s == 3) {
		mutable_query_string.append('~' + fuzziness);
	    }
	    // Reparse the query
	    ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
	    query = query_parser.parse(mutable_query_string.toString() + query_suffix);
	}
	else {
	    query = query_parser.parse(query_prefix + query_suffix);
	}

	return query;
    }
309
310
311 /**
312 * @todo Michael to comment
313 */
314 private static Filter parseFilterString(String filter_string)
315 {
316 Filter result = null;
317 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
318 Matcher matcher = pattern.matcher(filter_string);
319 if (matcher.matches()) {
320 String field_name = matcher.group(1);
321 boolean include_lower = matcher.group(2).equals("[");
322 String lower_term = matcher.group(3);
323 String upper_term = matcher.group(4);
324 boolean include_upper = matcher.group(5).equals("]");
325 result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
326 }
327 else {
328 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
329 }
330 return result;
331 }
332 /** parseFilterString() **/
333}
Note: See TracBrowser for help on using the repository browser.