1 | /**********************************************************************
|
---|
2 | *
|
---|
3 | * Greenstone3SearchHandler.java
|
---|
4 | *
|
---|
5 | * Copyright 2015 The New Zealand Digital Library Project
|
---|
6 | *
|
---|
7 | * A component of the Greenstone digital library software
|
---|
8 | * from the New Zealand Digital Library Project at the
|
---|
9 | * University of Waikato, New Zealand.
|
---|
10 | *
|
---|
11 | * This program is free software; you can redistribute it and/or modify
|
---|
12 | * it under the terms of the GNU General Public License as published by
|
---|
13 | * the Free Software Foundation; either version 2 of the License, or
|
---|
14 | * (at your option) any later version.
|
---|
15 | *
|
---|
16 | * This program is distributed in the hope that it will be useful,
|
---|
17 | * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
18 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
19 | * GNU General Public License for more details.
|
---|
20 | *
|
---|
21 | * You should have received a copy of the GNU General Public License
|
---|
22 | * along with this program; if not, write to the Free Software
|
---|
23 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
24 | *
|
---|
25 | *********************************************************************/
|
---|
26 |
|
---|
27 | package org.greenstone.solrserver;
|
---|
28 |
|
---|
29 | import org.apache.solr.handler.component.SearchHandler;
|
---|
30 |
|
---|
31 |
|
---|
32 | import org.slf4j.Logger;
|
---|
33 | import org.slf4j.LoggerFactory;
|
---|
34 | //import org.apache.log4j.Logger;
|
---|
35 |
|
---|
36 | import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams, a subclass of SolrParams
|
---|
37 |
|
---|
38 | import org.apache.solr.core.CoreContainer;
|
---|
39 | import org.apache.solr.core.SolrCore;
|
---|
40 |
|
---|
41 | import org.apache.lucene.index.IndexReader;
|
---|
42 | import org.apache.lucene.index.Term;
|
---|
43 |
|
---|
44 | import org.apache.lucene.search.BooleanClause;
|
---|
45 | import org.apache.lucene.search.BooleanQuery;
|
---|
46 | import org.apache.lucene.search.MultiTermQuery;
|
---|
47 | import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
|
---|
48 |
|
---|
49 |
|
---|
50 | import org.apache.solr.search.QParser;
|
---|
51 | import org.apache.solr.search.SolrIndexSearcher;
|
---|
52 |
|
---|
53 | import org.apache.solr.request.SolrQueryRequest;
|
---|
54 | import org.apache.solr.response.SolrQueryResponse;
|
---|
55 |
|
---|
56 | import java.util.ArrayList;
|
---|
57 | import java.util.Collection;
|
---|
58 | import java.util.Iterator;
|
---|
59 | import java.util.List;
|
---|
60 | import java.util.Set;
|
---|
61 | import java.util.HashSet;
|
---|
62 |
|
---|
63 | /**
|
---|
64 | * This class is a custom Solr RequestHandler that sits on the solr server side (in tomcat's solr webapp)
|
---|
65 | * and when it receives a query request (sent to this SearchHandler), it will expand the query terms
|
---|
66 | * by calling query.rewrite and then request the totaltermfreq and termfreq for these individual terms.
|
---|
67 | * This class was made necessary by the fact that solr/lucene index locking exceptions occurred when
|
---|
68 | * this code used to be in ext/solr's SolrQueryWrapper.java::getTerms().
|
---|
69 | *
|
---|
70 | * With the customisations in this class, can search a Solr collection for: econom* cat
|
---|
71 | * And the total and term frequencies will be returned for all expanded forms, depending on the analyzer.
|
---|
72 | */
|
---|
73 |
|
---|
74 |
|
---|
75 | // Important page:
|
---|
76 | // https://wiki.apache.org/solr/SolrPlugins
|
---|
77 | public class Greenstone3SearchHandler extends SearchHandler
|
---|
78 | {
|
---|
79 | // IMPORTANT NOTE: Logging doesn't work in this calss either with log4j or slf4j,
|
---|
80 | // but System.err goes to catalina.out.
|
---|
81 |
|
---|
82 | //protected static Logger log = LoggerFactory.getLogger(Greenstone3SearchHandler.class);
|
---|
83 | //static Logger logger = LoggerFactory.getLogger(org.greenstone.solrserver.Greenstone3SearchHandler.class.getName());
|
---|
84 |
|
---|
85 | protected MultiTermQuery.RewriteMethod currentRewriteMethod
|
---|
86 | = MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
|
---|
87 | // which is less CPU intensive than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE)
|
---|
88 |
|
---|
89 | // This recursive method calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
|
---|
90 | // since by default PrefixQueries get rewritten to ConstantScoreQueries and don't get expanded.
|
---|
91 | // Calling setRewriteMethod on each MultiTermQuery in query here is useful to later ensure that any
|
---|
92 | // MultiTermQueries like PrefixQueries and WildcareQueries can get expanded,
|
---|
93 | // including when embedded in BooleanQueries.
|
---|
94 | protected Query getSimplified(Query query)
|
---|
95 | {
|
---|
96 |
|
---|
97 | // base case
|
---|
98 | if(query instanceof MultiTermQuery) { // PrefixQuery or WildcardQuery
|
---|
99 |
|
---|
100 | // for some reason, when a PrefixQuery (e.g. econom*) gets rewritten to a ConstantScoreQuery
|
---|
101 | // it no longer rewrites the query to produce the expanded terms. Need to setRewriteMethod
|
---|
102 | // http://stackoverflow.com/questions/3060636/lucene-score-calculation-with-a-prefixquery
|
---|
103 | // See also http://trac.greenstone.org/ticket/845 and http://trac.greenstone.org/changeset/26157
|
---|
104 |
|
---|
105 | MultiTermQuery mtQuery = (MultiTermQuery)query;
|
---|
106 | mtQuery.setRewriteMethod(currentRewriteMethod);
|
---|
107 |
|
---|
108 | }
|
---|
109 |
|
---|
110 | else if(query instanceof BooleanQuery) {
|
---|
111 |
|
---|
112 | BooleanQuery bQuery = (BooleanQuery)query;
|
---|
113 | Iterator<BooleanClause> clauses = bQuery.iterator();
|
---|
114 |
|
---|
115 | while(clauses.hasNext()) {
|
---|
116 | BooleanClause clause = clauses.next();
|
---|
117 | Query clauseQuery = clause.getQuery();
|
---|
118 | Query expandedClauseQuery = getSimplified(clauseQuery);
|
---|
119 | clause.setQuery(expandedClauseQuery);
|
---|
120 | }
|
---|
121 | }
|
---|
122 |
|
---|
123 | // another type of query, leave as-is
|
---|
124 | return query;
|
---|
125 | }
|
---|
126 |
|
---|
127 | protected Query expandQuery(SolrQueryRequest req, Query parsedQuery) throws Exception {
|
---|
128 |
|
---|
129 | // calls setRewriteMethod on any MultiTermQueries inside the given (boolean)query,
|
---|
130 | // doing so ensures MultiTermQueries like PrefixQueries and WildcareQueries can get expanded
|
---|
131 | parsedQuery = getSimplified(parsedQuery); // can throw exception
|
---|
132 |
|
---|
133 | // now finally rewrite the query to any expand Prefix- and WildCareQueries contained in here
|
---|
134 | SolrIndexSearcher searcher = req.getSearcher();
|
---|
135 | IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
|
---|
136 | parsedQuery = parsedQuery.rewrite(indexReader); // used to get rewritten to ConstantScoreQuery
|
---|
137 |
|
---|
138 | return parsedQuery;
|
---|
139 | }
|
---|
140 |
|
---|
141 | @Override
|
---|
142 | public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
|
---|
143 | {
|
---|
144 |
|
---|
145 | // do getTerms() here:
|
---|
146 | // getParams, modify solrparams (q is query_string)
|
---|
147 | // if q exists, then do extractTerms and queryRewrite
|
---|
148 | // then req.setSolrParams
|
---|
149 | // and continue on as before: super.handleRequestBody(req, rsp);
|
---|
150 |
|
---|
151 | SolrQuery solrParams = new SolrQuery();
|
---|
152 | solrParams.add(req.getParams());
|
---|
153 |
|
---|
154 | //String query_string = "TX:(farming)";
|
---|
155 | String query_string = solrParams.get("q");
|
---|
156 |
|
---|
157 |
|
---|
158 | if(query_string == null || query_string.equals("")) {
|
---|
159 | log.error("@@@@@@@@@ " + this.getClass() + " - QUERY STRING EMPTY");
|
---|
160 | }
|
---|
161 | else {
|
---|
162 | //System.err.println("@@@ Parsing query_string " + query_string);
|
---|
163 |
|
---|
164 |
|
---|
165 | QParser qParser = QParser.getParser(query_string, "lucene", req);
|
---|
166 | Query parsedQuery = qParser.getQuery();
|
---|
167 |
|
---|
168 | // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
|
---|
169 | // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
|
---|
170 | // because it has not done the Query.rewrite() step yet. So do that manually for them.
|
---|
171 | // This still doesn't provide us with the terms that econom* or *date break down into.
|
---|
172 |
|
---|
173 | //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
|
---|
174 | // Should we just check superclass MultiTermQuery?
|
---|
175 | // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
|
---|
176 | // just test for * in the query_string to determine if we need to do a rewrite() or not
|
---|
177 | if(query_string.contains("*")) {
|
---|
178 |
|
---|
179 | //System.err.println("@@@@ query's class: " + parsedQuery.getClass().getName());
|
---|
180 |
|
---|
181 |
|
---|
182 | // See also common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java
|
---|
183 | // Of http://trac.greenstone.org/changeset/26157 and http://trac.greenstone.org/ticket/845
|
---|
184 | try {
|
---|
185 | parsedQuery = expandQuery(req, parsedQuery);
|
---|
186 |
|
---|
187 | } catch(BooleanQuery.TooManyClauses ex) { // hits this exception if searching solr coll for "a*"
|
---|
188 | System.err.println("@@@@ Encountered TooManyClauses Exception: " + ex.getMessage());
|
---|
189 | System.err.println("@@@@ Trying CustomRewriteMethod");
|
---|
190 |
|
---|
191 | MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
|
---|
192 | customRewriteMethod.setDocCountPercent(100.0);
|
---|
193 | customRewriteMethod.setTermCountCutoff(350); // same as default
|
---|
194 | this.currentRewriteMethod = customRewriteMethod;
|
---|
195 |
|
---|
196 | try {
|
---|
197 | // try query.rewrite() again now
|
---|
198 | parsedQuery = expandQuery(req, parsedQuery);
|
---|
199 |
|
---|
200 | } catch(BooleanQuery.TooManyClauses bex) { // still too many clauses
|
---|
201 | System.err.println("@@@@ Encountered TooManyClauses Exception despite CustomRewriteMethod: "
|
---|
202 | + bex.getMessage());
|
---|
203 | System.err.println("@@@@ Using default Multiterm RewriteMethod");
|
---|
204 |
|
---|
205 | // do what the code originally did: use the default rewriteMethod which
|
---|
206 | // uses a default docCountPercent=0.1 (%) and termCountCutoff=350
|
---|
207 | currentRewriteMethod = MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT;
|
---|
208 |
|
---|
209 | // this will succeed, but probably won't expand * in Prefix- and WildcardQueries
|
---|
210 | parsedQuery = expandQuery(req, parsedQuery);
|
---|
211 | }
|
---|
212 | }
|
---|
213 | //System.err.println("@@@@ rewritten query is now: " + parsedQuery);
|
---|
214 | }
|
---|
215 |
|
---|
216 |
|
---|
217 | // extract the terms
|
---|
218 | Set<Term> extractedQueryTerms = new HashSet<Term>();
|
---|
219 | parsedQuery.extractTerms(extractedQueryTerms);
|
---|
220 |
|
---|
221 | // need to sort the terms for presentation, since a Set is unsorted
|
---|
222 | List<Term> termsList = new ArrayList<Term>(extractedQueryTerms);
|
---|
223 | java.util.Collections.sort(termsList); // Term implements Comparable, terms sorted alphabetically
|
---|
224 |
|
---|
225 |
|
---|
226 |
|
---|
227 | Iterator<Term> termsIterator = termsList.iterator();//extractedQueryTerms.iterator();
|
---|
228 | while(termsIterator.hasNext()) {
|
---|
229 | Term term = termsIterator.next();
|
---|
230 | //System.err.println("#### Found query term: " + term);
|
---|
231 |
|
---|
232 | String field = term.field();
|
---|
233 | String queryTerm = term.text();
|
---|
234 |
|
---|
235 | // totaltermfreq(TI, 'farming')
|
---|
236 | // termfreq(TI, 'farming')
|
---|
237 | solrParams.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
|
---|
238 | solrParams.addField("termfreq(" + field + ",'" + queryTerm + "')");
|
---|
239 | }
|
---|
240 | }
|
---|
241 |
|
---|
242 | // set to modified SolrQuery SolrParams
|
---|
243 | req.setParams(solrParams);
|
---|
244 | // send off modified request
|
---|
245 | super.handleRequestBody(req, rsp);
|
---|
246 | }
|
---|
247 | } |
---|