source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/solrserver/Greenstone3SearchHandler.java@ 29987

Last change on this file since 29987 was 29987, checked in by ak19, 9 years ago

Adjustments to comments.

File size: 9.9 KB
Line 
1/**********************************************************************
2 *
3 * Greenstone3SearchHandler.java
4 *
5 * Copyright 2015 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26
27package org.greenstone.solrserver;
28
29import org.apache.solr.handler.component.SearchHandler;
30
31
32import org.slf4j.Logger;
33import org.slf4j.LoggerFactory;
34//import org.apache.log4j.Logger;
35
36import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams, a subclass of SolrParams
37
38import org.apache.solr.core.CoreContainer;
39import org.apache.solr.core.SolrCore;
40
41import org.apache.lucene.index.IndexReader;
42import org.apache.lucene.index.Term;
43
44import org.apache.lucene.search.BooleanClause;
45import org.apache.lucene.search.BooleanQuery;
46import org.apache.lucene.search.MultiTermQuery;
47import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
48
49
50import org.apache.solr.search.QParser;
51import org.apache.solr.search.SolrIndexSearcher;
52
53import org.apache.solr.request.SolrQueryRequest;
54import org.apache.solr.response.SolrQueryResponse;
55
56import java.util.ArrayList;
57import java.util.Collection;
58import java.util.Iterator;
59import java.util.List;
60import java.util.Set;
61import java.util.HashSet;
62
63/**
64 * This class is a custom Solr RequestHandler that sits on the solr server side (in tomcat's solr webapp)
65 * and when it receives a query request (sent to this SearchHandler), it will expand the query terms
66 * by calling query.rewrite and then request the totaltermfreq and totalfreq for these individual terms.
67 * This class was made necessary by the fact that solr/lucene index locking exceptions occurred when
68 * this code used to be in ext/solr's SolrQueryWrapper.java::getTerms().
69 *
70 * With the customisations in this class, can search a Solr collection for: econom* cat
71 * And the total and term frequencies will be returned for all expanded forms, depending on the analyzer.
72 */
73
74
75// Important page:
76// https://wiki.apache.org/solr/SolrPlugins
77public class Greenstone3SearchHandler extends SearchHandler
78{
79 // IMPORTANT NOTE: Logging doesn't work in this calss either with log4j or slf4j,
80 // but System.err goes to catalina.out.
81
82 //protected static Logger log = LoggerFactory.getLogger(Greenstone3SearchHandler.class);
83 //static Logger logger = LoggerFactory.getLogger(org.greenstone.solrserver.Greenstone3SearchHandler.class.getName());
84
85 protected MultiTermQuery.RewriteMethod currentRewriteMethod
86 = MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
87 // which is less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
88
89 // This recursive method calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
90 // since by default PrefixQueries like farm* get rewritten to ConstantScoreQueries and don't get expanded.
91 // Calling setRewriteMethod on each MultiTermQuery in query here is useful to later ensure that any
92 // MultiTermQueries like PrefixQueries and WildcardQueries can get expanded,
93 // including when embedded in BooleanQueries.
94 protected Query getSimplified(Query query)
95 {
96
97 // base case
98 if(query instanceof MultiTermQuery) { // PrefixQuery or WildcardQuery
99
100 // for some reason, when a PrefixQuery (e.g. econom*) gets rewritten to a ConstantScoreQuery
101 // it no longer rewrites the query to produce the expanded terms. Need to setRewriteMethod
102 // http://stackoverflow.com/questions/3060636/lucene-score-calculation-with-a-prefixquery
103 // See also http://trac.greenstone.org/ticket/845 and http://trac.greenstone.org/changeset/26157
104
105 MultiTermQuery mtQuery = (MultiTermQuery)query;
106 mtQuery.setRewriteMethod(currentRewriteMethod);
107
108 }
109
110 else if(query instanceof BooleanQuery) {
111
112 BooleanQuery bQuery = (BooleanQuery)query;
113 Iterator<BooleanClause> clauses = bQuery.iterator();
114
115 while(clauses.hasNext()) {
116 BooleanClause clause = clauses.next();
117 Query clauseQuery = clause.getQuery();
118 Query expandedClauseQuery = getSimplified(clauseQuery);
119 clause.setQuery(expandedClauseQuery);
120 }
121 }
122
123 // another type of query, leave as-is
124 return query;
125 }
126
127 protected Query expandQuery(SolrQueryRequest req, Query parsedQuery) throws Exception {
128
129 // calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
130 // doing so ensures MultiTermQueries like PrefixQueries and WildcareQueries can get expanded
131 parsedQuery = getSimplified(parsedQuery); // can throw exception
132
133 // now finally rewrite the query to any expand Prefix- and WildCareQueries contained in here
134 SolrIndexSearcher searcher = req.getSearcher();
135 IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
136 parsedQuery = parsedQuery.rewrite(indexReader); // used to get rewritten to ConstantScoreQuery
137
138 return parsedQuery;
139 }
140
141 @Override
142 public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
143 {
144
145 // do getTerms() here:
146 // getParams, modify solrparams (q is query_string)
147 // if q exists, then do extractTerms and queryRewrite
148 // then req.setSolrParams
149 // and continue on as before: super.handleRequestBody(req, rsp);
150
151 SolrQuery solrParams = new SolrQuery();
152 solrParams.add(req.getParams());
153
154 //String query_string = "TX:(farming)";
155 String query_string = solrParams.get("q");
156
157
158 if(query_string == null || query_string.equals("")) {
159 log.error("@@@@@@@@@ " + this.getClass() + " - QUERY STRING EMPTY");
160 }
161 else {
162 //System.err.println("@@@ Parsing query_string " + query_string);
163
164
165 QParser qParser = QParser.getParser(query_string, "lucene", req);
166 Query parsedQuery = qParser.getQuery();
167
168 // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
169 // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
170 // because it has not done the Query.rewrite() step yet. So do that manually for them.
171 // This still doesn't provide us with the terms that econom* or *date break down into.
172
173 //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
174 // Should we just check superclass MultiTermQuery?
175 // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
176 // just test for * in the query_string to determine if we need to do a rewrite() or not
177 if(query_string.contains("*")) {
178
179 //System.err.println("@@@@ query's class: " + parsedQuery.getClass().getName());
180
181
182 // See also common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java
183 // Of http://trac.greenstone.org/changeset/26157 and http://trac.greenstone.org/ticket/845
184 try {
185 parsedQuery = expandQuery(req, parsedQuery);
186
187 } catch(BooleanQuery.TooManyClauses ex) { // hits this exception if searching solr coll for "a*"
188 System.err.println("@@@@ Encountered TooManyClauses Exception: " + ex.getMessage());
189 System.err.println("@@@@ Trying CustomRewriteMethod");
190
191 MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
192 customRewriteMethod.setDocCountPercent(100.0);
193 customRewriteMethod.setTermCountCutoff(350); // same as default
194 this.currentRewriteMethod = customRewriteMethod;
195
196 try {
197 // try query.rewrite() again now
198 parsedQuery = expandQuery(req, parsedQuery);
199
200 } catch(BooleanQuery.TooManyClauses bex) { // still too many clauses
201 System.err.println("@@@@ Encountered TooManyClauses Exception despite CustomRewriteMethod: "
202 + bex.getMessage());
203 System.err.println("@@@@ Using default Multiterm RewriteMethod");
204
205 // do what the code originally did: use the default rewriteMethod which
206 // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
207 currentRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
208
209 // this will succeed, but probably won't expand * in Prefix- and WildcardQueries
210 parsedQuery = expandQuery(req, parsedQuery);
211 }
212 }
213 //System.err.println("@@@@ rewritten query is now: " + parsedQuery);
214 }
215
216
217 // extract the terms
218 Set<Term> extractedQueryTerms = new HashSet<Term>();
219 parsedQuery.extractTerms(extractedQueryTerms);
220
221 // need to sort the terms for presentation, since a Set is unsorted
222 List<Term> termsList = new ArrayList<Term>(extractedQueryTerms);
223 java.util.Collections.sort(termsList); // Term implements Comparable, terms sorted alphabetically
224
225
226
227 Iterator<Term> termsIterator = termsList.iterator();//extractedQueryTerms.iterator();
228 while(termsIterator.hasNext()) {
229 Term term = termsIterator.next();
230 //System.err.println("#### Found query term: " + term);
231
232 String field = term.field();
233 String queryTerm = term.text();
234
235 // totaltermfreq(TI, 'farming')
236 // termfreq(TI, 'farming')
237 solrParams.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
238 solrParams.addField("termfreq(" + field + ",'" + queryTerm + "')");
239 }
240 }
241
242 // set to modified SolrQuery SolrParams
243 req.setParams(solrParams);
244 // send off modified request
245 super.handleRequestBody(req, rsp);
246 }
247}
Note: See TracBrowser for help on using the repository browser.