Context Navigation

source: trunk/indexers/lucene-gs/src/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 12987

Last change on this file since 12987 was 12987, checked in by mdewsnip, 18 years ago
You can now specify the query string as a command-line argument to GS2LuceneQuery.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermDocs;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.RangeFilter;
32	import org.apache.lucene.search.Searcher;
33	import org.apache.lucene.search.ScoreDoc;
34	import org.apache.lucene.search.Sort;
35	import org.apache.lucene.search.TopFieldDocs;
36
37
38	public class GS2LuceneQuery
39	{
40	static private String TEXTFIELD = "TX";
41
42	// Use the standard set of English stop words by default
43	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45	// Command-line options
46	static private String fuzziness = null;
47	static private Filter filter = null;
48	static private Sort sorter = new Sort();
49	static private String default_conjuction_operator = "OR";
50	static private int start_results = 1;
51	static private int end_results = Integer.MAX_VALUE;
52
53
54	static public void main (String args[])
55	{
56	if (args.length == 0) {
57	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number] [query]");
58	return;
59	}
60
61	try {
62	Searcher searcher = new IndexSearcher(args[0]);
63	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65	// Create one query parser with the standard set of stop words, and one with none
66	QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67	QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69	String query_string = null;
70
71	// Parse the command-line arguments
72	for (int i = 1; i < args.length; i++) {
73	if (args[i].equals("-sort")) {
74	i++;
75	sorter = new Sort(args[i]);
76	}
77	else if (args[i].equals("-filter")) {
78	i++;
79	filter = parseFilterString(args[i]);
80	}
81	else if (args[i].equals("-dco")) {
82	i++;
83	default_conjuction_operator = args[i];
84	}
85	else if (args[i].equals("-fuzziness")) {
86	i++;
87	fuzziness = args[i];
88	}
89	else if (args[i].equals("-startresults")) {
90	i++;
91	if (args[i].matches("\\d+")) {
92	start_results = Integer.parseInt(args[i]);
93	}
94	}
95	else if (args[i].equals("-endresults")) {
96	i++;
97	if (args[i].matches("\\d+")) {
98	end_results = Integer.parseInt(args[i]);
99	}
100	}
101	else {
102	query_string = args[i];
103	}
104	}
105
106	// Lucene does "OR" queries by default; do an "AND" query if specified
107	if (default_conjuction_operator.equals("AND")) {
108	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110	}
111
112	// The query string has been specified as a command-line argument
113	if (query_string != null) {
114	runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115	}
116
117	// Read queries from STDIN
118	else {
119	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120	while (true) {
121	// Read the query from STDIN
122	query_string = in.readLine();
123	if (query_string == null \|\| query_string.length() == -1) {
124	break;
125	}
126
127	runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128	}
129	}
130	}
131	catch (IOException exception) {
132	exception.printStackTrace();
133	}
134	}
135
136
137	private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
138	throws IOException
139	{
140	try {
141	Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
142	query_including_stop_words = query_including_stop_words.rewrite(reader);
143
144	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
145	query = query.rewrite(reader);
146
147	// Return the list of expanded query terms and their frequencies
148	HashSet terms = new HashSet();
149	query.extractTerms(terms);
150	Iterator term_iterator = terms.iterator();
151	System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
152	while (term_iterator.hasNext()) {
153	Term term = (Term) term_iterator.next();
154
155	// Get the term frequency over all the documents
156	TermDocs term_docs = reader.termDocs(term);
157	int term_freq = term_docs.freq();
158	while (term_docs.next()) {
159	term_freq += term_docs.freq();
160	}
161
162	// If you wanted to limit this to just text terms add
163	// something like this:
164	// if (term.field().equals(TEXTFIELD))
165	System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
166	}
167
168	// Return the list of stop words removed from the query
169	HashSet terms_including_stop_words = new HashSet();
170	query_including_stop_words.extractTerms(terms_including_stop_words);
171	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
172	while (terms_including_stop_words_iter.hasNext()) {
173	Term term = (Term) terms_including_stop_words_iter.next();
174	if (!terms.contains(term)) {
175	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
176	}
177	}
178
179	// Simple case for getting all the matching documents
180	if (end_results == Integer.MAX_VALUE) {
181	// Perform the query (filter and sorter may be null)
182	Hits hits = searcher.search(query, filter, sorter);
183	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
184
185	// Output the matching documents
186	System.out.println(" <StartResults num=\"" + start_results + "\" />");
187	System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
188	for (int i = start_results; i <= hits.length(); i++) {
189	Document doc = hits.doc(i - 1);
190	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
191	}
192	}
193
194	// Slightly more complicated case for returning a subset of the matching documents
195	else {
196	// Perform the query (filter may be null)
197	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
198	System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
199
200	// Output the matching documents
201	System.out.println(" <StartResults num=\"" + start_results + "\" />");
202	System.out.println(" <EndsResults num=\"" + end_results + "\" />");
203	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
204	Document doc = reader.document(hits.scoreDocs[i - 1].doc);
205	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
206	}
207	}
208	}
209	catch (ParseException parse_exception) {
210	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
211	}
212	catch (TooManyClauses too_many_clauses_exception) {
213	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
214	}
215
216	System.out.println("</ResultSet>");
217	}
218
219
220	private static String xmlSafe(String text) {
221	return text.replaceAll("\\&", "\\&");
222	}
223
224	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
225	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
226	{
227	// Split query string into the search terms and the filter terms
228	// * The first +(...) term contains the search terms so count
229	// up '(' and stop when we finish matching ')'
230	int offset = 0;
231	int paren_count = 0;
232	boolean seen_paren = false;
233	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
234	if (query_string.charAt(offset) == '(') {
235	paren_count++;
236	seen_paren = true;
237	}
238	if (query_string.charAt(offset) == ')') {
239	paren_count--;
240	}
241	offset++;
242	}
243	String query_prefix = query_string.substring(0, offset);
244	String query_suffix = query_string.substring(offset);
245
246	///ystem.err.println("Prefix: " + query_prefix);
247	///ystem.err.println("Suffix: " + query_suffix);
248
249	Query query = query_parser.parse(query_prefix);
250	query = query.rewrite(reader);
251
252	// If this is a fuzzy search, then we need to add the fuzzy
253	// flag to each of the query terms
254	if (fuzziness != null && query.toString().length() > 0) {
255	// Revert the query to a string
256	System.err.println("Rewritten query: " + query.toString());
257	// Search through the string for TX:<term> query terms
258	// and append the ~ operator. Not that this search will
259	// not change phrase searches (TX:"<term> <term>") as
260	// fuzzy searching is not possible for these entries.
261	// Yahoo! Time for a state machine!
262	StringBuffer mutable_query_string = new StringBuffer(query.toString());
263	int o = 0; // Offset
264	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
265	int s = 0; // State
266	while (o < mutable_query_string.length()) {
267	char c = mutable_query_string.charAt(o);
268	if (s == 0 && c == TEXTFIELD.charAt(0)) {
269	///ystem.err.println("Found T!");
270	s = 1;
271	}
272	else if (s == 1) {
273	if (c == TEXTFIELD.charAt(1)) {
274	///ystem.err.println("Found X!");
275	s = 2;
276	}
277	else {
278	s = 0; // Reset
279	}
280	}
281	else if (s == 2) {
282	if (c == ':') {
283	///ystem.err.println("Found TX:!");
284	s = 3;
285	}
286	else {
287	s = 0; // Reset
288	}
289	}
290	else if (s == 3) {
291	// Don't process phrases
292	if (c == '"') {
293	///ystem.err.println("Stupid phrase...");
294	s = 0; // Reset
295	}
296	// Found the end of the term... add the
297	// fuzzy search indicator
298	// Nor outside the scope of parentheses
299	else if (Character.isWhitespace(c) \|\| c == ')') {
300	///ystem.err.println("Yahoo! Found fuzzy term.");
301	mutable_query_string.insert(o, '~' + fuzziness);
302	o++;
303	s = 0; // Reset
304	}
305	}
306	o++;
307	}
308	// If we were in the state of looking for the end of a
309	// term - then we just found it!
310	if (s == 3) {
311	mutable_query_string.append('~' + fuzziness);
312	}
313	// Reparse the query
314	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
315	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
316	}
317	else {
318	query = query_parser.parse(query_prefix + query_suffix);
319	}
320
321	return query;
322	}
323
324
325	/**
326	* @todo Michael to comment
327	*/
328	private static Filter parseFilterString(String filter_string)
329	{
330	Filter result = null;
331	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
332	Matcher matcher = pattern.matcher(filter_string);
333	if (matcher.matches()) {
334	String field_name = matcher.group(1);
335	boolean include_lower = matcher.group(2).equals("[");
336	String lower_term = matcher.group(3);
337	String upper_term = matcher.group(4);
338	boolean include_upper = matcher.group(5).equals("]");
339	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
340	}
341	else {
342	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
343	}
344	return result;
345	}
346	/ parseFilterString() /
347	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: