source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/solrserver/Greenstone3SearchHandler.java@ 29986

Last change on this file since 29986 was 29986, checked in by ak19, 9 years ago

The getTerms() functionality previously used by the EmbeddedSolrServer has now been re-implemented for HttpSolrServer with the new custom Greenstone Solr RequestHandler class Greenstone3SearchHandler, which lives on the solr server side, in tomcat's solr webapp. The functionality has been improved, such as being able to search for: econom* cat, by recursively calling setRewriteMethod on any PrefixQuery and WildcardQuery MultiTermQueries within an overall BooleanQuery, and by handling BooleanQuery.TooManyClauses exceptions when the number of expanded terms is too large, such as for a search of a*.

File size: 9.9 KB
Line 
1/**********************************************************************
2 *
3 * Greenstone3SearchHandler.java
4 *
5 * Copyright 2015 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.solrserver;
28
29import org.apache.solr.handler.component.SearchHandler;
30
31
32import org.slf4j.Logger;
33import org.slf4j.LoggerFactory;
34//import org.apache.log4j.Logger;
35
36import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams, a subclass of SolrParams
37
38import org.apache.solr.core.CoreContainer;
39import org.apache.solr.core.SolrCore;
40
41import org.apache.lucene.index.IndexReader;
42import org.apache.lucene.index.Term;
43
44import org.apache.lucene.search.BooleanClause;
45import org.apache.lucene.search.BooleanQuery;
46import org.apache.lucene.search.MultiTermQuery;
47import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
48
49
50import org.apache.solr.search.QParser;
51import org.apache.solr.search.SolrIndexSearcher;
52
53import org.apache.solr.request.SolrQueryRequest;
54import org.apache.solr.response.SolrQueryResponse;
55
56import java.util.ArrayList;
57import java.util.Collection;
58import java.util.Iterator;
59import java.util.List;
60import java.util.Set;
61import java.util.HashSet;
62
63/**
64 * This class is a custom Solr RequestHandler that sits on the solr server side (in tomcat's solr webapp)
65 * and when it receives a query request (sent to this SearchHandler), it will expand the query terms
66 * by calling query.rewrite and then request the totaltermfreq and termfreq for these individual terms.
67 * This class was made necessary by the fact that solr/lucene index locking exceptions occurred when
68 * this code used to be in ext/solr's SolrQueryWrapper.java::getTerms().
69 *
70 * With the customisations in this class, can search a Solr collection for: econom* cat
71 * And the total and term frequencies will be returned for all expanded forms, depending on the analyzer.
72 */
73
74
75// Important page:
76// https://wiki.apache.org/solr/SolrPlugins
77public class Greenstone3SearchHandler extends SearchHandler
78{
79 // IMPORTANT NOTE: Logging doesn't work in this calss either with log4j or slf4j,
80 // but System.err goes to catalina.out.
81
82 //protected static Logger log = LoggerFactory.getLogger(Greenstone3SearchHandler.class);
83 //static Logger logger = LoggerFactory.getLogger(org.greenstone.solrserver.Greenstone3SearchHandler.class.getName());
84
85 protected MultiTermQuery.RewriteMethod currentRewriteMethod
86 = MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
87 // which is less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
88
89 // This recursive method calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
90 // since by default PrefixQueries get rewritten to ConstantScoreQueries and don't get expanded.
91 // Calling setRewriteMethod on each MultiTermQuery in query here is useful to later ensure that any
92 // MultiTermQueries like PrefixQueries and WildcareQueries can get expanded,
93 // including when embedded in BooleanQueries.
94 protected Query getSimplified(Query query)
95 {
96
97 // base case
98 if(query instanceof MultiTermQuery) { // PrefixQuery or WildcardQuery
99
100 // for some reason, when a PrefixQuery (e.g. econom*) gets rewritten to a ConstantScoreQuery
101 // it no longer rewrites the query to produce the expanded terms. Need to setRewriteMethod
102 // http://stackoverflow.com/questions/3060636/lucene-score-calculation-with-a-prefixquery
103 // See also http://trac.greenstone.org/ticket/845 and http://trac.greenstone.org/changeset/26157
104
105 MultiTermQuery mtQuery = (MultiTermQuery)query;
106 mtQuery.setRewriteMethod(currentRewriteMethod);
107
108 }
109
110 else if(query instanceof BooleanQuery) {
111
112 BooleanQuery bQuery = (BooleanQuery)query;
113 Iterator<BooleanClause> clauses = bQuery.iterator();
114
115 while(clauses.hasNext()) {
116 BooleanClause clause = clauses.next();
117 Query clauseQuery = clause.getQuery();
118 Query expandedClauseQuery = getSimplified(clauseQuery);
119 clause.setQuery(expandedClauseQuery);
120 }
121 }
122
123 // another type of query, leave as-is
124 return query;
125 }
126
127 protected Query expandQuery(SolrQueryRequest req, Query parsedQuery) throws Exception {
128
129 // calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
130 // doing so ensures MultiTermQueries like PrefixQueries and WildcareQueries can get expanded
131 parsedQuery = getSimplified(parsedQuery); // can throw exception
132
133 // now finally rewrite the query to any expand Prefix- and WildCareQueries contained in here
134 SolrIndexSearcher searcher = req.getSearcher();
135 IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
136 parsedQuery = parsedQuery.rewrite(indexReader); // used to get rewritten to ConstantScoreQuery
137
138 return parsedQuery;
139 }
140
141 @Override
142 public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
143 {
144
145 // do getTerms() here:
146 // getParams, modify solrparams (q is query_string)
147 // if q exists, then do extractTerms and queryRewrite
148 // then req.setSolrParams
149 // and continue on as before: super.handleRequestBody(req, rsp);
150
151 SolrQuery solrParams = new SolrQuery();
152 solrParams.add(req.getParams());
153
154 //String query_string = "TX:(farming)";
155 String query_string = solrParams.get("q");
156
157
158 if(query_string == null || query_string.equals("")) {
159 log.error("@@@@@@@@@ " + this.getClass() + " - QUERY STRING EMPTY");
160 }
161 else {
162 //System.err.println("@@@ Parsing query_string " + query_string);
163
164
165 QParser qParser = QParser.getParser(query_string, "lucene", req);
166 Query parsedQuery = qParser.getQuery();
167
168 // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
169 // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
170 // because it has not done the Query.rewrite() step yet. So do that manually for them.
171 // This still doesn't provide us with the terms that econom* or *date break down into.
172
173 //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
174 // Should we just check superclass MultiTermQuery?
175 // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
176 // just test for * in the query_string to determine if we need to do a rewrite() or not
177 if(query_string.contains("*")) {
178
179 //System.err.println("@@@@ query's class: " + parsedQuery.getClass().getName());
180
181
182 // See also common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java
183 // Of http://trac.greenstone.org/changeset/26157 and http://trac.greenstone.org/ticket/845
184 try {
185 parsedQuery = expandQuery(req, parsedQuery);
186
187 } catch(BooleanQuery.TooManyClauses ex) { // hits this exception if searching solr coll for "a*"
188 System.err.println("@@@@ Encountered TooManyClauses Exception: " + ex.getMessage());
189 System.err.println("@@@@ Trying CustomRewriteMethod");
190
191 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
192 customRewriteMethod.setDocCountPercent(100.0);
193 customRewriteMethod.setTermCountCutoff(350); // same as default
194 this.currentRewriteMethod = customRewriteMethod;
195
196 try {
197 // try query.rewrite() again now
198 parsedQuery = expandQuery(req, parsedQuery);
199
200 } catch(BooleanQuery.TooManyClauses bex) { // still too many clauses
201 System.err.println("@@@@ Encountered TooManyClauses Exception despite CustomRewriteMethod: "
202 + bex.getMessage());
203 System.err.println("@@@@ Using default Multiterm RewriteMethod");
204
205 // do what the code originally did: use the default rewriteMethod which
206 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
207 currentRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
208
209 // this will succeed, but probably won't expand * in Prefix- and WildcardQueries
210 parsedQuery = expandQuery(req, parsedQuery);
211 }
212 }
213 //System.err.println("@@@@ rewritten query is now: " + parsedQuery);
214 }
215
216
217 // extract the terms
218 Set<Term> extractedQueryTerms = new HashSet<Term>();
219 parsedQuery.extractTerms(extractedQueryTerms);
220
221 // need to sort the terms for presentation, since a Set is unsorted
222 List<Term> termsList = new ArrayList<Term>(extractedQueryTerms);
223 java.util.Collections.sort(termsList); // Term implements Comparable, terms sorted alphabetically
224
225
226
227 Iterator<Term> termsIterator = termsList.iterator();//extractedQueryTerms.iterator();
228 while(termsIterator.hasNext()) {
229 Term term = termsIterator.next();
230 //System.err.println("#### Found query term: " + term);
231
232 String field = term.field();
233 String queryTerm = term.text();
234
235 // totaltermfreq(TI, 'farming')
236 // termfreq(TI, 'farming')
237 solrParams.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
238 solrParams.addField("termfreq(" + field + ",'" + queryTerm + "')");
239 }
240 }
241
242 // set to modified SolrQuery SolrParams
243 req.setParams(solrParams);
244 // send off modified request
245 super.handleRequestBody(req, rsp);
246 }
247}
Note: See TracBrowser for help on using the repository browser.