Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 12991

Last change on this file since 12991 was 12991, checked in by mdewsnip, 18 years ago
Ooops... managed to lose the header of the XML output in my recent changes.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.8 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermDocs;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.RangeFilter;
32	import org.apache.lucene.search.Searcher;
33	import org.apache.lucene.search.ScoreDoc;
34	import org.apache.lucene.search.Sort;
35	import org.apache.lucene.search.TopFieldDocs;
36
37
38	public class GS2LuceneQuery
39	{
40	static private String TEXTFIELD = "TX";
41
42	// Use the standard set of English stop words by default
43	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
44
45	// Command-line options
46	static private String fuzziness = null;
47	static private Filter filter = null;
48	static private Sort sorter = new Sort();
49	static private String default_conjuction_operator = "OR";
50	static private int start_results = 1;
51	static private int end_results = Integer.MAX_VALUE;
52
53
54	static public void main (String args[])
55	{
56	if (args.length == 0) {
57	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number] [query]");
58	return;
59	}
60
61	try {
62	Searcher searcher = new IndexSearcher(args[0]);
63	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
64
65	// Create one query parser with the standard set of stop words, and one with none
66	QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
67	QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
68
69	String query_string = null;
70
71	// Parse the command-line arguments
72	for (int i = 1; i < args.length; i++) {
73	if (args[i].equals("-sort")) {
74	i++;
75	sorter = new Sort(args[i]);
76	}
77	else if (args[i].equals("-filter")) {
78	i++;
79	filter = parseFilterString(args[i]);
80	}
81	else if (args[i].equals("-dco")) {
82	i++;
83	default_conjuction_operator = args[i];
84	}
85	else if (args[i].equals("-fuzziness")) {
86	i++;
87	fuzziness = args[i];
88	}
89	else if (args[i].equals("-startresults")) {
90	i++;
91	if (args[i].matches("\\d+")) {
92	start_results = Integer.parseInt(args[i]);
93	}
94	}
95	else if (args[i].equals("-endresults")) {
96	i++;
97	if (args[i].matches("\\d+")) {
98	end_results = Integer.parseInt(args[i]);
99	}
100	}
101	else {
102	query_string = args[i];
103	}
104	}
105
106	// Lucene does "OR" queries by default; do an "AND" query if specified
107	if (default_conjuction_operator.equals("AND")) {
108	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
109	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
110	}
111
112	// The query string has been specified as a command-line argument
113	if (query_string != null) {
114	runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
115	}
116
117	// Read queries from STDIN
118	else {
119	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
120	while (true) {
121	// Read the query from STDIN
122	query_string = in.readLine();
123	if (query_string == null \|\| query_string.length() == -1) {
124	break;
125	}
126
127	runQuery(searcher, reader, query_parser, query_parser_no_stop_words, query_string);
128	}
129	}
130
131	searcher.close();
132	}
133	catch (IOException exception) {
134	exception.printStackTrace();
135	}
136	}
137
138
139	private static void runQuery(Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
140	throws IOException
141	{
142	System.out.println("<ResultSet>");
143	System.out.println(" <QueryString>" + xmlSafe(query_string) + "</QueryString>");
144	if (filter != null) {
145	System.out.println(" <FilterString>" + filter.toString() + "</FilterString>");
146	}
147
148	try {
149	Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
150	query_including_stop_words = query_including_stop_words.rewrite(reader);
151
152	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
153	query = query.rewrite(reader);
154
155	// Return the list of expanded query terms and their frequencies
156	HashSet terms = new HashSet();
157	query.extractTerms(terms);
158	Iterator term_iterator = terms.iterator();
159	System.out.println(" <QueryTermsInfo num=\"" + terms.size() + "\"/>");
160	while (term_iterator.hasNext()) {
161	Term term = (Term) term_iterator.next();
162
163	// Get the term frequency over all the documents
164	TermDocs term_docs = reader.termDocs(term);
165	int term_freq = term_docs.freq();
166	while (term_docs.next()) {
167	term_freq += term_docs.freq();
168	}
169
170	// If you wanted to limit this to just text terms add
171	// something like this:
172	// if (term.field().equals(TEXTFIELD))
173	System.out.println(" <Term value=\"" + term.text() + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />");
174	}
175
176	// Return the list of stop words removed from the query
177	HashSet terms_including_stop_words = new HashSet();
178	query_including_stop_words.extractTerms(terms_including_stop_words);
179	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
180	while (terms_including_stop_words_iter.hasNext()) {
181	Term term = (Term) terms_including_stop_words_iter.next();
182	if (!terms.contains(term)) {
183	System.out.println(" <StopWord value=\"" + term.text() + "\"/>");
184	}
185	}
186
187	// Simple case for getting all the matching documents
188	if (end_results == Integer.MAX_VALUE) {
189	// Perform the query (filter and sorter may be null)
190	Hits hits = searcher.search(query, filter, sorter);
191	System.out.println(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>");
192
193	// Output the matching documents
194	System.out.println(" <StartResults num=\"" + start_results + "\" />");
195	System.out.println(" <EndsResults num=\"" + hits.length() + "\" />");
196	for (int i = start_results; i <= hits.length(); i++) {
197	Document doc = hits.doc(i - 1);
198	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
199	}
200	}
201
202	// Slightly more complicated case for returning a subset of the matching documents
203	else {
204	// Perform the query (filter may be null)
205	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
206	System.out.println(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>");
207
208	// Output the matching documents
209	System.out.println(" <StartResults num=\"" + start_results + "\" />");
210	System.out.println(" <EndsResults num=\"" + end_results + "\" />");
211	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
212	Document doc = reader.document(hits.scoreDocs[i - 1].doc);
213	System.out.println(" <Match id=\"" + doc.get("nodeID") + "\" />");
214	}
215	}
216	}
217	catch (ParseException parse_exception) {
218	System.out.println(" <Error type=\"PARSE_EXCEPTION\"/>");
219	}
220	catch (TooManyClauses too_many_clauses_exception) {
221	System.out.println(" <Error type=\"TOO_MANY_CLAUSES\"/>");
222	}
223
224	System.out.println("</ResultSet>");
225	}
226
227
228	private static String xmlSafe(String text) {
229	return text.replaceAll("\\&", "\\&");
230	}
231
232	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
233	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
234	{
235	// Split query string into the search terms and the filter terms
236	// * The first +(...) term contains the search terms so count
237	// up '(' and stop when we finish matching ')'
238	int offset = 0;
239	int paren_count = 0;
240	boolean seen_paren = false;
241	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
242	if (query_string.charAt(offset) == '(') {
243	paren_count++;
244	seen_paren = true;
245	}
246	if (query_string.charAt(offset) == ')') {
247	paren_count--;
248	}
249	offset++;
250	}
251	String query_prefix = query_string.substring(0, offset);
252	String query_suffix = query_string.substring(offset);
253
254	///ystem.err.println("Prefix: " + query_prefix);
255	///ystem.err.println("Suffix: " + query_suffix);
256
257	Query query = query_parser.parse(query_prefix);
258	query = query.rewrite(reader);
259
260	// If this is a fuzzy search, then we need to add the fuzzy
261	// flag to each of the query terms
262	if (fuzziness != null && query.toString().length() > 0) {
263	// Revert the query to a string
264	System.err.println("Rewritten query: " + query.toString());
265	// Search through the string for TX:<term> query terms
266	// and append the ~ operator. Not that this search will
267	// not change phrase searches (TX:"<term> <term>") as
268	// fuzzy searching is not possible for these entries.
269	// Yahoo! Time for a state machine!
270	StringBuffer mutable_query_string = new StringBuffer(query.toString());
271	int o = 0; // Offset
272	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
273	int s = 0; // State
274	while (o < mutable_query_string.length()) {
275	char c = mutable_query_string.charAt(o);
276	if (s == 0 && c == TEXTFIELD.charAt(0)) {
277	///ystem.err.println("Found T!");
278	s = 1;
279	}
280	else if (s == 1) {
281	if (c == TEXTFIELD.charAt(1)) {
282	///ystem.err.println("Found X!");
283	s = 2;
284	}
285	else {
286	s = 0; // Reset
287	}
288	}
289	else if (s == 2) {
290	if (c == ':') {
291	///ystem.err.println("Found TX:!");
292	s = 3;
293	}
294	else {
295	s = 0; // Reset
296	}
297	}
298	else if (s == 3) {
299	// Don't process phrases
300	if (c == '"') {
301	///ystem.err.println("Stupid phrase...");
302	s = 0; // Reset
303	}
304	// Found the end of the term... add the
305	// fuzzy search indicator
306	// Nor outside the scope of parentheses
307	else if (Character.isWhitespace(c) \|\| c == ')') {
308	///ystem.err.println("Yahoo! Found fuzzy term.");
309	mutable_query_string.insert(o, '~' + fuzziness);
310	o++;
311	s = 0; // Reset
312	}
313	}
314	o++;
315	}
316	// If we were in the state of looking for the end of a
317	// term - then we just found it!
318	if (s == 3) {
319	mutable_query_string.append('~' + fuzziness);
320	}
321	// Reparse the query
322	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
323	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
324	}
325	else {
326	query = query_parser.parse(query_prefix + query_suffix);
327	}
328
329	return query;
330	}
331
332
333	/**
334	* @todo Michael to comment
335	*/
336	private static Filter parseFilterString(String filter_string)
337	{
338	Filter result = null;
339	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
340	Matcher matcher = pattern.matcher(filter_string);
341	if (matcher.matches()) {
342	String field_name = matcher.group(1);
343	boolean include_lower = matcher.group(2).equals("[");
344	String lower_term = matcher.group(3);
345	String upper_term = matcher.group(4);
346	boolean include_upper = matcher.group(5).equals("]");
347	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
348	}
349	else {
350	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
351	}
352	return result;
353	}
354	/** parseFilterString() **/
355	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: