Context Navigation

source: trunk/gsdl/src/java/org/nzdl/gsdl/LuceneWrap/GS2LuceneQuery.java@ 13054

Last change on this file since 13054 was 13054, checked in by mdewsnip, 18 years ago
Now puts the terms through xmlSafe() as well, to prevent invalid XML with weird terms containing punctuation.
Property svn:keywords set to `Author Date Id Revision`
File size: 15.5 KB

Line
1	/**
2	*
3	* @author [email protected]
4	* @author [email protected]
5	* @author [email protected]
6	* @author [email protected]
7	* @version
8	*/
9
10	package org.nzdl.gsdl.LuceneWrap;
11
12
13	import java.io.*;
14	import java.util.*;
15	import java.util.regex.*;
16
17	import org.apache.lucene.analysis.Analyzer;
18	import org.apache.lucene.analysis.standard.StandardAnalyzer;
19	import org.apache.lucene.document.Document;
20	import org.apache.lucene.index.IndexReader;
21	import org.apache.lucene.index.Term;
22	import org.apache.lucene.index.TermDocs;
23	import org.apache.lucene.queryParser.ParseException;
24	import org.apache.lucene.queryParser.QueryParser;
25	import org.apache.lucene.search.BooleanQuery.TooManyClauses;
26	import org.apache.lucene.search.Filter;
27	import org.apache.lucene.search.Hit;
28	import org.apache.lucene.search.Hits;
29	import org.apache.lucene.search.IndexSearcher;
30	import org.apache.lucene.search.Query;
31	import org.apache.lucene.search.RangeFilter;
32	import org.apache.lucene.search.Searcher;
33	import org.apache.lucene.search.ScoreDoc;
34	import org.apache.lucene.search.Sort;
35	import org.apache.lucene.search.TopFieldDocs;
36
37
38	public class GS2LuceneQuery
39	{
40	static private String TEXTFIELD = "TX";
41
42	// Fairly self-explanatory I should hope
43	static private boolean query_result_caching_enabled = false;
44
45	// Use the standard set of English stop words by default
46	static private String[] stop_words = StandardAnalyzer.STOP_WORDS;
47
48	// Command-line options
49	static private String fuzziness = null;
50	static private String filter_string = null;
51	static private Filter filter = null;
52	static private String sort_string = null;
53	static private Sort sorter = new Sort();
54	static private String default_conjuction_operator = "OR";
55	static private int start_results = 1;
56	static private int end_results = Integer.MAX_VALUE;
57
58
59	static public void main (String args[])
60	{
61	if (args.length == 0) {
62	System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND\|OR] [-startresults number -endresults number] [query]");
63	return;
64	}
65
66	try {
67	String index_directory = args[0];
68	Searcher searcher = new IndexSearcher(index_directory);
69	IndexReader reader = ((IndexSearcher) searcher).getIndexReader();
70
71	// Prepare the index cache directory, if query result caching is enabled
72	if (query_result_caching_enabled) {
73	// Make the index cache directory if it doesn't already exist
74	File index_cache_directory = new File(index_directory, "cache");
75	if (!index_cache_directory.exists()) {
76	index_cache_directory.mkdir();
77	}
78
79	// Disable caching if the index cache directory isn't available
80	if (!index_cache_directory.exists() \|\| !index_cache_directory.isDirectory()) {
81	query_result_caching_enabled = false;
82	}
83	}
84
85	// Create one query parser with the standard set of stop words, and one with none
86	QueryParser query_parser = new QueryParser(TEXTFIELD, new StandardAnalyzer(stop_words));
87	QueryParser query_parser_no_stop_words = new QueryParser(TEXTFIELD, new StandardAnalyzer(new String[] { }));
88
89	String query_string = null;
90
91	// Parse the command-line arguments
92	for (int i = 1; i < args.length; i++) {
93	if (args[i].equals("-sort")) {
94	i++;
95	sort_string = args[i];
96	sorter = new Sort(sort_string);
97	}
98	else if (args[i].equals("-filter")) {
99	i++;
100	filter_string = args[i];
101	filter = parseFilterString(filter_string);
102	}
103	else if (args[i].equals("-dco")) {
104	i++;
105	default_conjuction_operator = args[i];
106	}
107	else if (args[i].equals("-fuzziness")) {
108	i++;
109	fuzziness = args[i];
110	}
111	else if (args[i].equals("-startresults")) {
112	i++;
113	if (args[i].matches("\\d+")) {
114	start_results = Integer.parseInt(args[i]);
115	}
116	}
117	else if (args[i].equals("-endresults")) {
118	i++;
119	if (args[i].matches("\\d+")) {
120	end_results = Integer.parseInt(args[i]);
121	}
122	}
123	else {
124	query_string = args[i];
125	}
126	}
127
128	// Lucene does "OR" queries by default; do an "AND" query if specified
129	if (default_conjuction_operator.equals("AND")) {
130	query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
131	query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
132	}
133
134	// The query string has been specified as a command-line argument
135	if (query_string != null) {
136	runQuery(index_directory, searcher, reader, query_parser, query_parser_no_stop_words, query_string);
137	}
138
139	// Read queries from STDIN
140	else {
141	BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
142	while (true) {
143	// Read the query from STDIN
144	query_string = in.readLine();
145	if (query_string == null \|\| query_string.length() == -1) {
146	break;
147	}
148
149	runQuery(index_directory, searcher, reader, query_parser, query_parser_no_stop_words, query_string);
150	}
151	}
152
153	searcher.close();
154	}
155	catch (IOException exception) {
156	exception.printStackTrace();
157	}
158	}
159
160
161	private static void runQuery(String index_directory, Searcher searcher, IndexReader reader, QueryParser query_parser, QueryParser query_parser_no_stop_words, String query_string)
162	throws IOException
163	{
164	StringBuffer query_results_xml = new StringBuffer();
165
166	// Check if this query result has been cached from a previous search (if it's enabled)
167	File query_result_cache_file = null;
168	if (query_result_caching_enabled) {
169	// Generate the cache file name from the query options
170	String query_result_cache_file_name = query_string + "-";
171	query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
172	query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
173	query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
174	query_result_cache_file_name += default_conjuction_operator + "-";
175	query_result_cache_file_name += start_results + "-" + end_results;
176	query_result_cache_file_name = fileSafe(query_result_cache_file_name);
177
178	// If the query result cache file exists, just return its contents and we're done
179	File index_cache_directory = new File(index_directory, "cache");
180	query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
181	if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
182	FileInputStream fis = new FileInputStream(query_result_cache_file);
183	InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
184	BufferedReader buffered_reader = new BufferedReader(isr);
185	String line = "";
186	while ((line = buffered_reader.readLine()) != null) {
187	query_results_xml.append(line + "\n");
188	}
189	String query_results_xml_string = query_results_xml.toString();
190	query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
191	System.out.print(query_results_xml_string);
192	return;
193	}
194	}
195
196	query_results_xml.append("<ResultSet cached=\"false\">\n");
197	query_results_xml.append(" <QueryString>" + xmlSafe(query_string) + "</QueryString>\n");
198	if (filter != null) {
199	query_results_xml.append(" <FilterString>" + filter.toString() + "</FilterString>\n");
200	}
201
202	try {
203	Query query_including_stop_words = parseQuery(reader, query_parser_no_stop_words, query_string, fuzziness);
204	query_including_stop_words = query_including_stop_words.rewrite(reader);
205
206	Query query = parseQuery(reader, query_parser, query_string, fuzziness);
207	query = query.rewrite(reader);
208
209	// Return the list of expanded query terms and their frequencies
210	HashSet terms = new HashSet();
211	query.extractTerms(terms);
212	Iterator term_iterator = terms.iterator();
213	query_results_xml.append(" <QueryTermsInfo num=\"" + terms.size() + "\"/>\n");
214	while (term_iterator.hasNext()) {
215	Term term = (Term) term_iterator.next();
216
217	// Get the term frequency over all the documents
218	TermDocs term_docs = reader.termDocs(term);
219	int term_freq = term_docs.freq();
220	while (term_docs.next()) {
221	term_freq += term_docs.freq();
222	}
223
224	// If you wanted to limit this to just text terms add
225	// something like this:
226	// if (term.field().equals(TEXTFIELD))
227	query_results_xml.append(" <Term value=\"" + xmlSafe(term.text()) + "\" field=\"" + term.field() + "\" freq=\"" + term_freq + "\" />\n");
228	}
229
230	// Return the list of stop words removed from the query
231	HashSet terms_including_stop_words = new HashSet();
232	query_including_stop_words.extractTerms(terms_including_stop_words);
233	Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
234	while (terms_including_stop_words_iter.hasNext()) {
235	Term term = (Term) terms_including_stop_words_iter.next();
236	if (!terms.contains(term)) {
237	query_results_xml.append(" <StopWord value=\"" + term.text() + "\"/>\n");
238	}
239	}
240
241	// Simple case for getting all the matching documents
242	if (end_results == Integer.MAX_VALUE) {
243	// Perform the query (filter and sorter may be null)
244	Hits hits = searcher.search(query, filter, sorter);
245	query_results_xml.append(" <MatchingDocsInfo num=\"" + hits.length() + "\"/>\n");
246
247	// Output the matching documents
248	query_results_xml.append(" <StartResults num=\"" + start_results + "\" />\n");
249	query_results_xml.append(" <EndsResults num=\"" + hits.length() + "\" />\n");
250	for (int i = start_results; i <= hits.length(); i++) {
251	Document doc = hits.doc(i - 1);
252	query_results_xml.append(" <Match id=\"" + doc.get("nodeID") + "\" />\n");
253	}
254	}
255
256	// Slightly more complicated case for returning a subset of the matching documents
257	else {
258	// Perform the query (filter may be null)
259	TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
260	query_results_xml.append(" <MatchingDocsInfo num=\"" + hits.totalHits + "\"/>\n");
261
262	// Output the matching documents
263	query_results_xml.append(" <StartResults num=\"" + start_results + "\" />\n");
264	query_results_xml.append(" <EndsResults num=\"" + end_results + "\" />\n");
265	for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
266	Document doc = reader.document(hits.scoreDocs[i - 1].doc);
267	query_results_xml.append(" <Match id=\"" + doc.get("nodeID") + "\" />\n");
268	}
269	}
270	}
271	catch (ParseException parse_exception) {
272	query_results_xml.append(" <Error type=\"PARSE_EXCEPTION\"/>\n");
273	}
274	catch (TooManyClauses too_many_clauses_exception) {
275	query_results_xml.append(" <Error type=\"TOO_MANY_CLAUSES\"/>\n");
276	}
277
278	query_results_xml.append("</ResultSet>\n");
279
280	System.out.print(query_results_xml);
281
282	// Cache this query result, if desired
283	if (query_result_caching_enabled) {
284	FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
285	query_result_cache_file_writer.write(query_results_xml.toString());
286	query_result_cache_file_writer.close();
287	}
288	}
289
290
291	private static String fileSafe(String text)
292	{
293	StringBuffer file_safe_text = new StringBuffer();
294	for (int i = 0; i < text.length(); i++) {
295	char character = text.charAt(i);
296	if ((character >= 'A' && character <= 'Z') \|\| (character >= 'a' && character <= 'z') \|\| (character >= '0' && character <= '9') \|\| character == '-') {
297	file_safe_text.append(character);
298	}
299	else {
300	file_safe_text.append('%');
301	file_safe_text.append((int) character);
302	}
303	}
304	return file_safe_text.toString();
305	}
306
307
308	private static String xmlSafe(String text) {
309	text = text.replaceAll("&","&amp;");
310	text = text.replaceAll("<","&lt;");
311	text = text.replaceAll(">","&gt;");
312	text = text.replaceAll("'","&#039;");
313	text = text.replaceAll("\\\"","&quot;");
314	return text;
315	}
316
317
318	private static Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
319	throws java.io.IOException, org.apache.lucene.queryParser.ParseException
320	{
321	// Split query string into the search terms and the filter terms
322	// * The first +(...) term contains the search terms so count
323	// up '(' and stop when we finish matching ')'
324	int offset = 0;
325	int paren_count = 0;
326	boolean seen_paren = false;
327	while (offset < query_string.length() && (!seen_paren \|\| paren_count > 0)) {
328	if (query_string.charAt(offset) == '(') {
329	paren_count++;
330	seen_paren = true;
331	}
332	if (query_string.charAt(offset) == ')') {
333	paren_count--;
334	}
335	offset++;
336	}
337	String query_prefix = query_string.substring(0, offset);
338	String query_suffix = query_string.substring(offset);
339
340	///ystem.err.println("Prefix: " + query_prefix);
341	///ystem.err.println("Suffix: " + query_suffix);
342
343	Query query = query_parser.parse(query_prefix);
344	query = query.rewrite(reader);
345
346	// If this is a fuzzy search, then we need to add the fuzzy
347	// flag to each of the query terms
348	if (fuzziness != null && query.toString().length() > 0) {
349	// Revert the query to a string
350	System.err.println("Rewritten query: " + query.toString());
351	// Search through the string for TX:<term> query terms
352	// and append the ~ operator. Not that this search will
353	// not change phrase searches (TX:"<term> <term>") as
354	// fuzzy searching is not possible for these entries.
355	// Yahoo! Time for a state machine!
356	StringBuffer mutable_query_string = new StringBuffer(query.toString());
357	int o = 0; // Offset
358	// 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
359	int s = 0; // State
360	while (o < mutable_query_string.length()) {
361	char c = mutable_query_string.charAt(o);
362	if (s == 0 && c == TEXTFIELD.charAt(0)) {
363	///ystem.err.println("Found T!");
364	s = 1;
365	}
366	else if (s == 1) {
367	if (c == TEXTFIELD.charAt(1)) {
368	///ystem.err.println("Found X!");
369	s = 2;
370	}
371	else {
372	s = 0; // Reset
373	}
374	}
375	else if (s == 2) {
376	if (c == ':') {
377	///ystem.err.println("Found TX:!");
378	s = 3;
379	}
380	else {
381	s = 0; // Reset
382	}
383	}
384	else if (s == 3) {
385	// Don't process phrases
386	if (c == '"') {
387	///ystem.err.println("Stupid phrase...");
388	s = 0; // Reset
389	}
390	// Found the end of the term... add the
391	// fuzzy search indicator
392	// Nor outside the scope of parentheses
393	else if (Character.isWhitespace(c) \|\| c == ')') {
394	///ystem.err.println("Yahoo! Found fuzzy term.");
395	mutable_query_string.insert(o, '~' + fuzziness);
396	o++;
397	s = 0; // Reset
398	}
399	}
400	o++;
401	}
402	// If we were in the state of looking for the end of a
403	// term - then we just found it!
404	if (s == 3) {
405	mutable_query_string.append('~' + fuzziness);
406	}
407	// Reparse the query
408	///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
409	query = query_parser.parse(mutable_query_string.toString() + query_suffix);
410	}
411	else {
412	query = query_parser.parse(query_prefix + query_suffix);
413	}
414
415	return query;
416	}
417
418
419	/**
420	* @todo Michael to comment
421	*/
422	private static Filter parseFilterString(String filter_string)
423	{
424	Filter result = null;
425	Pattern pattern = Pattern.compile("\\s\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s");
426	Matcher matcher = pattern.matcher(filter_string);
427	if (matcher.matches()) {
428	String field_name = matcher.group(1);
429	boolean include_lower = matcher.group(2).equals("[");
430	String lower_term = matcher.group(3);
431	String upper_term = matcher.group(4);
432	boolean include_upper = matcher.group(5).equals("]");
433	result = new RangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
434	}
435	else {
436	System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
437	}
438	return result;
439	}
440	/ parseFilterString() /
441	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: