source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/LuceneWrapper/GS2LuceneQuery.java@ 24641

Last change on this file since 24641 was 24641, checked in by davidb, 13 years ago

Initial cut at Greenstone3 runtime code to support Solr. Solr code based on version 3.3, so this also includes an upgraded version of the LuceneWrapper code (gs2build/common-src/indexers/lucene-gs) that works with this version of the support jar files

File size: 20.5 KB
Line 
1/**********************************************************************
2 *
3 * GS2LuceneQuery.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.LuceneWrapper;
27
28
29import java.io.*;
30import java.util.*;
31import java.util.regex.*;
32
33import org.apache.lucene.analysis.Analyzer;
34import org.apache.lucene.analysis.standard.StandardAnalyzer;
35import org.apache.lucene.document.Document;
36import org.apache.lucene.index.IndexReader;
37import org.apache.lucene.index.Term;
38import org.apache.lucene.index.TermDocs;
39import org.apache.lucene.queryParser.ParseException;
40import org.apache.lucene.queryParser.QueryParser;
41import org.apache.lucene.search.BooleanQuery.TooManyClauses;
42import org.apache.lucene.search.Filter;
43import org.apache.lucene.search.Hit;
44import org.apache.lucene.search.Hits;
45import org.apache.lucene.search.IndexSearcher;
46import org.apache.lucene.search.Query;
47import org.apache.lucene.search.TermRangeFilter;
48import org.apache.lucene.search.Searcher;
49import org.apache.lucene.search.ScoreDoc;
50import org.apache.lucene.search.Sort;
51import org.apache.lucene.search.SortField;
52import org.apache.lucene.search.TopFieldDocs;
53
54import org.apache.lucene.store.Directory;
55import org.apache.lucene.store.FSDirectory;
56import org.apache.lucene.util.Version;
57
58public class GS2LuceneQuery extends SharedSoleneQuery
59{
60 protected String full_indexdir="";
61
62 protected Sort sorter=new Sort();
63 protected Filter filter = null;
64
65 protected static Version matchVersion = Version.LUCENE_24;
66
67 protected QueryParser query_parser = null;
68 protected QueryParser query_parser_no_stop_words = null;
69 protected Searcher searcher = null;
70 protected IndexReader reader = null;
71
72 public GS2LuceneQuery() {
73 super();
74
75 // Create one query parser with the standard set of stop words, and one with none
76
77 query_parser = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer()); // uses built-in stop_words_set
78 query_parser_no_stop_words = new QueryParser(matchVersion, TEXTFIELD, new GS2Analyzer(new String[] { }));
79 }
80
81
82 public boolean initialise() {
83
84 if (!super.initialise()) {
85 return false;
86 }
87
88
89 if (full_indexdir==null || full_indexdir.length()==-1){
90 utf8out.println("Index directory is not indicated ");
91 utf8out.flush();
92 return false;
93 }
94
95 try {
96 Directory full_indexdir_dir = FSDirectory.open(new File(full_indexdir));
97 searcher = new IndexSearcher(full_indexdir_dir,true);
98 reader = ((IndexSearcher) searcher).getIndexReader();
99
100 }
101 catch (IOException exception) {
102 exception.printStackTrace();
103 return false;
104 }
105 return true;
106
107 }
108
109 public void setIndexDir(String full_indexdir) {
110 this.full_indexdir = full_indexdir;
111 }
112
113 public void setSortField(String sort_field) {
114 super.setSortField(sort_field);
115
116 if (sort_field == null) {
117 this.sorter = new Sort();
118 } else {
119 this.sorter = new Sort(new SortField(sort_field,SortField.STRING)); // **** can do better than this?!?
120 }
121 }
122
123 public void setFilterString(String filter_string) {
124 super.setFilterString(filter_string);
125 this.filter = parseFilterString(filter_string);
126 }
127
128 public Filter getFilter() {
129 return this.filter;
130 }
131
132
133 public LuceneQueryResult runQuery(String query_string) {
134
135 if (query_string == null || query_string.equals("")) {
136 utf8out.println("The query word is not indicated ");
137 utf8out.flush();
138 return null;
139 }
140
141 LuceneQueryResult lucene_query_result=new LuceneQueryResult();
142 lucene_query_result.clear();
143
144 try {
145 Query query_including_stop_words = query_parser_no_stop_words.parse(query_string);
146 query_including_stop_words = query_including_stop_words.rewrite(reader);
147
148 // System.err.println("********* query_string " + query_string + "****");
149
150 Query query = parseQuery(reader, query_parser, query_string, fuzziness);
151 query = query.rewrite(reader);
152
153 // Get the list of expanded query terms and their frequencies
154 // num docs matching, and total frequency
155 HashSet terms = new HashSet();
156 query.extractTerms(terms);
157
158 HashMap doc_term_freq_map = new HashMap();
159
160 Iterator iter = terms.iterator();
161 while (iter.hasNext()) {
162
163 Term term = (Term) iter.next();
164
165 // Get the term frequency over all the documents
166 TermDocs term_docs = reader.termDocs(term);
167 int term_freq = 0;
168 int match_docs = 0;
169 while (term_docs.next())
170 {
171 if (term_docs.freq() != 0)
172 {
173 term_freq += term_docs.freq();
174 match_docs++;
175
176 // Calculate the document-level term frequency as well
177 Integer lucene_doc_num_obj = new Integer(term_docs.doc());
178 int doc_term_freq = 0;
179 if (doc_term_freq_map.containsKey(lucene_doc_num_obj))
180 {
181 doc_term_freq = ((Integer) doc_term_freq_map.get(lucene_doc_num_obj)).intValue();
182 }
183 doc_term_freq += term_docs.freq();
184
185 doc_term_freq_map.put(lucene_doc_num_obj, new Integer(doc_term_freq));
186 }
187 }
188
189 // Create a term
190 lucene_query_result.addTerm(term.text(), term.field(), match_docs, term_freq);
191 }
192
193 // Get the list of stop words removed from the query
194 HashSet terms_including_stop_words = new HashSet();
195 query_including_stop_words.extractTerms(terms_including_stop_words);
196 Iterator terms_including_stop_words_iter = terms_including_stop_words.iterator();
197 while (terms_including_stop_words_iter.hasNext()) {
198 Term term = (Term) terms_including_stop_words_iter.next();
199 if (!terms.contains(term)) {
200 lucene_query_result.addStopWord(term.text());
201 }
202 }
203
204 // do the query
205 // Simple case for getting all the matching documents
206 if (end_results == Integer.MAX_VALUE) {
207 // Perform the query (filter and sorter may be null)
208 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
209 lucene_query_result.setTotalDocs(hits.totalHits);
210
211 // Output the matching documents
212 lucene_query_result.setStartResults(start_results);
213 lucene_query_result.setEndResults(hits.totalHits);
214
215 for (int i = start_results; i <= hits.totalHits; i++) {
216 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
217 Document doc = reader.document(lucene_doc_num);
218 int doc_term_freq = 0;
219 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
220 if (doc_term_freq_object != null)
221 {
222 doc_term_freq = doc_term_freq_object.intValue();
223 }
224 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
225 }
226 }
227
228 // Slightly more complicated case for returning a subset of the matching documents
229 else {
230 // Perform the query (filter may be null)
231 TopFieldDocs hits = searcher.search(query, filter, end_results, sorter);
232 lucene_query_result.setTotalDocs(hits.totalHits);
233
234 lucene_query_result.setStartResults(start_results);
235 lucene_query_result.setEndResults(end_results < hits.scoreDocs.length ? end_results: hits.scoreDocs.length);
236
237 // Output the matching documents
238 for (int i = start_results; (i <= hits.scoreDocs.length && i <= end_results); i++) {
239 int lucene_doc_num = hits.scoreDocs[i - 1].doc;
240 Document doc = reader.document(lucene_doc_num);
241 int doc_term_freq = 0;
242 Integer doc_term_freq_object = (Integer) doc_term_freq_map.get(new Integer(lucene_doc_num));
243 if (doc_term_freq_object != null)
244 {
245 doc_term_freq = doc_term_freq_object.intValue();
246 }
247 lucene_query_result.addDoc(doc.get("docOID").trim(), hits.scoreDocs[i-1].score, doc_term_freq);
248 }
249 }
250 }
251
252 catch (ParseException parse_exception) {
253 lucene_query_result.setError(LuceneQueryResult.PARSE_ERROR);
254 }
255 catch (TooManyClauses too_many_clauses_exception) {
256 lucene_query_result.setError(LuceneQueryResult.TOO_MANY_CLAUSES_ERROR);
257 }
258 catch (IOException exception) {
259 lucene_query_result.setError(LuceneQueryResult.IO_ERROR);
260 exception.printStackTrace();
261 }
262 catch (Exception exception) {
263 lucene_query_result.setError(LuceneQueryResult.OTHER_ERROR);
264 exception.printStackTrace();
265 }
266 return lucene_query_result;
267 }
268
269 public void setDefaultConjunctionOperator(String default_conjunction_operator) {
270 super.setDefaultConjunctionOperator(default_conjunction_operator);
271
272 if (default_conjunction_operator.equals("AND")) {
273 query_parser.setDefaultOperator(query_parser.AND_OPERATOR);
274 query_parser_no_stop_words.setDefaultOperator(query_parser.AND_OPERATOR);
275 } else { // default is OR
276 query_parser.setDefaultOperator(query_parser.OR_OPERATOR);
277 query_parser_no_stop_words.setDefaultOperator(query_parser.OR_OPERATOR);
278 }
279 }
280
281
282 public void cleanUp() {
283 super.cleanUp();
284 try {
285 if (searcher != null) {
286 searcher.close();
287 }
288 } catch (IOException exception) {
289 exception.printStackTrace();
290 }
291 }
292
293
294 protected Query parseQuery(IndexReader reader, QueryParser query_parser, String query_string, String fuzziness)
295 throws java.io.IOException, org.apache.lucene.queryParser.ParseException
296 {
297 // Split query string into the search terms and the filter terms
298 // * The first +(...) term contains the search terms so count
299 // up '(' and stop when we finish matching ')'
300 int offset = 0;
301 int paren_count = 0;
302 boolean seen_paren = false;
303 while (offset < query_string.length() && (!seen_paren || paren_count > 0)) {
304 if (query_string.charAt(offset) == '(') {
305 paren_count++;
306 seen_paren = true;
307 }
308 if (query_string.charAt(offset) == ')') {
309 paren_count--;
310 }
311 offset++;
312 }
313 String query_prefix = query_string.substring(0, offset);
314 String query_suffix = query_string.substring(offset);
315
316 ///ystem.err.println("Prefix: " + query_prefix);
317 ///ystem.err.println("Suffix: " + query_suffix);
318
319 Query query = query_parser.parse(query_prefix);
320 query = query.rewrite(reader);
321
322 // If this is a fuzzy search, then we need to add the fuzzy
323 // flag to each of the query terms
324 if (fuzziness != null && query.toString().length() > 0) {
325
326 // Revert the query to a string
327 System.err.println("Rewritten query: " + query.toString());
328 // Search through the string for TX:<term> query terms
329 // and append the ~ operator. Note that this search will
330 // not change phrase searches (TX:"<term> <term>") as
331 // fuzzy searching is not possible for these entries.
332 // Yahoo! Time for a state machine!
333 StringBuffer mutable_query_string = new StringBuffer(query.toString());
334 int o = 0; // Offset
335 // 0 = BASE, 1 = SEEN_T, 2 = SEEN_TX, 3 = SEEN_TX:
336 int s = 0; // State
337 while(o < mutable_query_string.length()) {
338 char c = mutable_query_string.charAt(o);
339 if (s == 0 && c == TEXTFIELD.charAt(0)) {
340 ///ystem.err.println("Found T!");
341 s = 1;
342 }
343 else if (s == 1) {
344 if (c == TEXTFIELD.charAt(1)) {
345 ///ystem.err.println("Found X!");
346 s = 2;
347 }
348 else {
349 s = 0; // Reset
350 }
351 }
352 else if (s == 2) {
353 if (c == ':') {
354 ///ystem.err.println("Found TX:!");
355 s = 3;
356 }
357 else {
358 s = 0; // Reset
359 }
360 }
361 else if (s == 3) {
362 // Don't process phrases
363 if (c == '"') {
364 ///ystem.err.println("Stupid phrase...");
365 s = 0; // Reset
366 }
367 // Found the end of the term... add the
368 // fuzzy search indicator
369 // Nor outside the scope of parentheses
370 else if (Character.isWhitespace(c) || c == ')') {
371 ///ystem.err.println("Yahoo! Found fuzzy term.");
372 mutable_query_string.insert(o, '~' + fuzziness);
373 o++;
374 s = 0; // Reset
375 }
376 }
377 o++;
378 }
379 // If we were in the state of looking for the end of a
380 // term - then we just found it!
381 if (s == 3) {
382
383 mutable_query_string.append('~' + fuzziness);
384 }
385 // Reparse the query
386 ///ystem.err.println("Fuzzy query: " + mutable_query_string.toString() + query_suffix);
387 query = query_parser.parse(mutable_query_string.toString() + query_suffix);
388 }
389 else {
390 query = query_parser.parse(query_prefix + query_suffix);
391 }
392
393 return query;
394 }
395
396 protected Filter parseFilterString(String filter_string)
397 {
398 Filter result = null;
399 Pattern pattern = Pattern.compile("\\s*\\+(\\w+)\\:([\\{\\[])(\\d+)\\s+TO\\s+(\\d+)([\\}\\]])\\s*");
400 Matcher matcher = pattern.matcher(filter_string);
401 if (matcher.matches()) {
402 String field_name = matcher.group(1);
403 boolean include_lower = matcher.group(2).equals("[");
404 String lower_term = matcher.group(3);
405 String upper_term = matcher.group(4);
406 boolean include_upper = matcher.group(5).equals("]");
407 result = new TermRangeFilter(field_name, lower_term, upper_term, include_lower, include_upper);
408 }
409 else {
410 System.err.println("Error: Could not understand filter string \"" + filter_string + "\"");
411 }
412 return result;
413 }
414
415
416 /** command line program and auxiliary methods */
417
418 // Fairly self-explanatory I should hope
419 static protected boolean query_result_caching_enabled = false;
420
421
422 static public void main (String args[])
423 {
424 if (args.length == 0) {
425 System.out.println("Usage: GS2LuceneQuery <index directory> [-fuzziness value] [-filter filter_string] [-sort sort_field] [-dco AND|OR] [-startresults number -endresults number] [query]");
426 return;
427 }
428
429 try {
430 String index_directory = args[0];
431
432 GS2LuceneQuery queryer = new GS2LuceneQuery();
433 queryer.setIndexDir(index_directory);
434
435 // Prepare the index cache directory, if query result caching is enabled
436 if (query_result_caching_enabled) {
437 // Make the index cache directory if it doesn't already exist
438 File index_cache_directory = new File(index_directory, "cache");
439 if (!index_cache_directory.exists()) {
440 index_cache_directory.mkdir();
441 }
442
443 // Disable caching if the index cache directory isn't available
444 if (!index_cache_directory.exists() || !index_cache_directory.isDirectory()) {
445 query_result_caching_enabled = false;
446 }
447 }
448
449 String query_string = null;
450
451 // Parse the command-line arguments
452 for (int i = 1; i < args.length; i++) {
453 if (args[i].equals("-sort")) {
454 i++;
455 queryer.setSortField(args[i]);
456 }
457 else if (args[i].equals("-filter")) {
458 i++;
459 queryer.setFilterString(args[i]);
460 }
461 else if (args[i].equals("-dco")) {
462 i++;
463 queryer.setDefaultConjunctionOperator(args[i]);
464 }
465 else if (args[i].equals("-fuzziness")) {
466 i++;
467 queryer.setFuzziness(args[i]);
468 }
469 else if (args[i].equals("-startresults")) {
470 i++;
471 if (args[i].matches("\\d+")) {
472 queryer.setStartResults(Integer.parseInt(args[i]));
473 }
474 }
475 else if (args[i].equals("-endresults")) {
476 i++;
477 if (args[i].matches("\\d+")) {
478 queryer.setEndResults(Integer.parseInt(args[i]));
479 }
480 }
481 else {
482 query_string = args[i];
483 }
484 }
485
486 if (!queryer.initialise()) {
487 return;
488 }
489
490 // The query string has been specified as a command-line argument
491 if (query_string != null) {
492 runQueryCaching(index_directory, queryer, query_string);
493 }
494
495 // Read queries from STDIN
496 else {
497 BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
498 while (true) {
499 // Read the query from STDIN
500 query_string = in.readLine();
501 if (query_string == null || query_string.length() == -1) {
502 break;
503 }
504
505 runQueryCaching(index_directory, queryer, query_string);
506
507 }
508 }
509 queryer.cleanUp();
510 }
511 catch (IOException exception) {
512 exception.printStackTrace();
513 }
514 }
515
516 protected static void runQueryCaching(String index_directory, GS2LuceneQuery queryer, String query_string)
517 throws IOException
518 {
519 StringBuffer query_results_xml = new StringBuffer();
520
521 // Check if this query result has been cached from a previous search (if it's enabled)
522 File query_result_cache_file = null;
523 if (query_result_caching_enabled) {
524 // Generate the cache file name from the query options
525 String query_result_cache_file_name = query_string + "-";
526 String fuzziness = queryer.getFuzziness();
527 query_result_cache_file_name += ((fuzziness != null) ? fuzziness : "") + "-";
528 String filter_string = queryer.getFilterString();
529 query_result_cache_file_name += ((filter_string != null) ? filter_string : "") + "-";
530 String sort_string = queryer.getSortField();
531 query_result_cache_file_name += ((sort_string != null) ? sort_string : "") + "-";
532 String default_conjunction_operator = queryer.getDefaultConjunctionOperator();
533 query_result_cache_file_name += default_conjunction_operator + "-";
534 int start_results = queryer.getStartResults();
535 int end_results = queryer.getEndResults();
536 query_result_cache_file_name += start_results + "-" + end_results;
537 query_result_cache_file_name = fileSafe(query_result_cache_file_name);
538
539 // If the query result cache file exists, just return its contents and we're done
540 File index_cache_directory = new File(index_directory, "cache");
541 query_result_cache_file = new File(index_cache_directory, query_result_cache_file_name);
542 if (query_result_cache_file.exists() && query_result_cache_file.isFile()) {
543 FileInputStream fis = new FileInputStream(query_result_cache_file);
544 InputStreamReader isr = new InputStreamReader(fis, "UTF-8");
545 BufferedReader buffered_reader = new BufferedReader(isr);
546 String line = "";
547 while ((line = buffered_reader.readLine()) != null) {
548 query_results_xml.append(line + "\n");
549 }
550 String query_results_xml_string = query_results_xml.toString();
551 query_results_xml_string = query_results_xml_string.replaceFirst("cached=\"false\"", "cached=\"true\"");
552
553 utf8out.print(query_results_xml_string);
554 utf8out.flush();
555
556 return;
557 }
558 }
559
560 // not cached
561 query_results_xml.append("<ResultSet cached=\"false\">\n");
562 query_results_xml.append("<QueryString>" + LuceneQueryResult.xmlSafe(query_string) + "</QueryString>\n");
563 Filter filter = queryer.getFilter();
564 if (filter != null) {
565 query_results_xml.append("<FilterString>" + filter.toString() + "</FilterString>\n");
566 }
567
568 LuceneQueryResult query_result = queryer.runQuery(query_string);
569 if (query_result == null) {
570 System.err.println("Couldn't run the query");
571 return;
572 }
573
574 if (query_result.getError() != LuceneQueryResult.NO_ERROR) {
575 query_results_xml.append("<Error type=\""+query_result.getErrorString()+"\" />\n");
576 } else {
577 query_results_xml.append(query_result.getXMLString());
578 }
579 query_results_xml.append("</ResultSet>\n");
580
581 utf8out.print(query_results_xml);
582 utf8out.flush();
583
584 // Cache this query result, if desired
585 if (query_result_caching_enabled) {
586 // Catch any exceptions thrown trying to write the query result cache file and warn about them, but don't
587 // bother with the full stack trace. It won't affect the functionality if we can't write some cache
588 // files, it will just affect the speed of subsequent requests.
589 // Example exceptions are "permission denied" errors, or "filename too long" errors (the filter string
590 // can get very long in some collections)
591 try
592 {
593 FileWriter query_result_cache_file_writer = new FileWriter(query_result_cache_file);
594 query_result_cache_file_writer.write(query_results_xml.toString());
595 query_result_cache_file_writer.close();
596 }
597 catch (Exception exception)
598 {
599 System.err.println("Warning: Exception occurred trying to write query result cache file (" + exception + ")");
600 }
601 }
602 }
603
604 protected static String fileSafe(String text)
605 {
606 StringBuffer file_safe_text = new StringBuffer();
607 for (int i = 0; i < text.length(); i++) {
608 char character = text.charAt(i);
609 if ((character >= 'A' && character <= 'Z') || (character >= 'a' && character <= 'z') || (character >= '0' && character <= '9') || character == '-') {
610 file_safe_text.append(character);
611 }
612 else {
613 file_safe_text.append('%');
614 file_safe_text.append((int) character);
615 }
616 }
617 return file_safe_text.toString();
618 }
619
620
621}
622
623
Note: See TracBrowser for help on using the repository browser.