source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@29986

Last change on this file since 29986 was 29986, checked in by ak19, 9 years ago

The getTerms() functionality previously used by the EmbeddedSolrServer has now been re-implemented for HttpSolrServer with the new custom Greenstone Solr RequestHandler class Greenstone3SearchHandler, which lives on the solr server side, in tomcat's solr webapp. The functionality has been improved, such as being able to search for econom* cat, by recursively calling setRewriteMethod on any PrefixQuery and WildcardQuery (MultiTermQuery) clauses within an overall BooleanQuery, and by handling BooleanQuery.TooManyClauses exceptions when the number of expanded terms is too large, such as for a search of a*.

  • Property svn:executable set to *
File size: 18.1 KB
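
As a rough illustration of the approach the change above describes, the sketch below shows how rewrite methods might be applied recursively before term extraction, assuming the Lucene 4.x API this revision builds against. The class and method names here (RewriteSketch, setRewriteMethodsRecursively, rewriteSafely) are invented for this example and are not part of Greenstone3SearchHandler or of the file listed below.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

// Illustrative only: recursively force a scoring rewrite on PrefixQuery/WildcardQuery
// (both MultiTermQuery subclasses) nested inside a BooleanQuery, then rewrite the whole
// query so that its expanded terms can be extracted.
public class RewriteSketch
{
    public static void setRewriteMethodsRecursively(Query query)
    {
        if (query instanceof MultiTermQuery) {
            // covers PrefixQuery, WildcardQuery and other automaton-based queries
            ((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
        }
        else if (query instanceof BooleanQuery) {
            for (BooleanClause clause : ((BooleanQuery) query).getClauses()) {
                setRewriteMethodsRecursively(clause.getQuery());
            }
        }
    }

    public static Query rewriteSafely(Query query, IndexReader reader)
    {
        try {
            setRewriteMethodsRecursively(query);
            return query.rewrite(reader);
        } catch (BooleanQuery.TooManyClauses tooMany) {
            // a very broad prefix such as a* can expand to more clauses than BooleanQuery
            // allows; fall back to the unrewritten query rather than fail the whole search
            return query;
        } catch (IOException e) {
            return query;
        }
    }
}

SCORING_BOOLEAN_QUERY_REWRITE expands a prefix or wildcard into an explicit BooleanQuery of matching terms, which is what makes the terms extractable; the trade-off is that a very broad pattern can overflow BooleanQuery's clause limit, hence the TooManyClauses fallback.
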
/**********************************************************************
 *
 * SolrQueryWrapper.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.gsdl3.util;

import java.lang.reflect.Type;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.HashSet;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;

import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;

import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.request.LocalSolrQueryRequest;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class SolrQueryWrapper extends SharedSoleneQuery
{
    public static String SORT_ASCENDING = "asc";
    public static String SORT_DESCENDING = "desc";
    public static String SORT_BY_RANK = "score";
    public static String SORT_BY_INDEX_ORDER = "_docid_";

    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
    protected int max_docs = 100;
    protected String sort_order = SORT_DESCENDING;
    protected String sort_field = SORT_BY_RANK; // don't want null default for solr
    protected ArrayList<String> _facets = new ArrayList<String>();
    protected ArrayList<String> _facetQueries = new ArrayList<String>();
    SolrServer solr_core = null;

    String collection_core_name_prefix = null;

    public SolrQueryWrapper()
    {
        super();
        start_results = 0;
    }

    public void setMaxDocs(int max_docs)
    {
        this.max_docs = max_docs;
    }

    public void setSolrCore(SolrServer solr_core)
    {
        this.solr_core = solr_core;
    }

    public void setCollectionCoreNamePrefix(String colCoreNamePrefix) {
        this.collection_core_name_prefix = colCoreNamePrefix;
    }

    // make sure it's not null.
    public void setSortField(String sort_field) {
        if (sort_field != null) {
            this.sort_field = sort_field;
        }
    }

    public void setSortOrder(String order)
    {
        this.sort_order = order;
    }
    public void addFacet(String facet)
    {
        if (!_facets.contains(facet))
        {
            _facets.add(facet);
        }
    }

    public void clearFacets()
    {
        _facets.clear();
    }

    public void addFacetQuery(String facetQuery)
    {
        if (!_facetQueries.contains(facetQuery))
        {
            _facetQueries.add(facetQuery);
        }
    }

    public void clearFacetQueries()
    {
        _facetQueries.clear();
    }

    public boolean initialise()
    {
        if (solr_core == null)
        {
            utf8out.println("Solr Core not loaded in ");
            utf8out.flush();
            return false;
        }
        return true;
    }


    /**
     * UNUSED.
     * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
     * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
     * of the Embedded kind.
     *
     * The functionality of getTerms has been moved to
     * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
     * tomcat's solr webapp).
     *
     * Extracts the query terms from the query string. The query string can be a boolean
     * combination of the various search fields with their search terms or phrases
     */
    public Term[] getTerms(SolrQuery solrQuery, String query_string)
    {
        Term terms[] = null;

        if(solr_core instanceof EmbeddedSolrServer) {
            EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;

            CoreContainer coreContainer = solrServer.getCoreContainer();

            Collection<SolrCore> solrCores = coreContainer.getCores();
            if(!solrCores.isEmpty()) {
                Iterator<SolrCore> coreIterator = solrCores.iterator();

                // Just use the first core that matches the collection name, since the term
                // frequency of any term is the same regardless of whether it's the didx or sidx core
                boolean foundCore = false;
                while(coreIterator.hasNext() && !foundCore) {
                    SolrCore solrCore = coreIterator.next();
                    if(this.collection_core_name_prefix != null) {
                        if(!solrCore.getName().startsWith(this.collection_core_name_prefix)) {
                            //logger.error("### Skipping core not of this collection: " + solrCore.getName());
                            continue;
                        }
                    } else {
                        logger.error("### Collection_core_name_prefix not set. Won't try to find terms");
                        break;
                    }

                    //logger.error("### Found core " + solrCore.getName() + " of this collection " + this.collection_core_name_prefix);
                    foundCore = true;

                    LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
                    Query parsedQuery = null;

                    try {

                        // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
                        QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
                        parsedQuery = qParser.getQuery();

                        // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
                        // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
                        // because it has not done the Query.rewrite() step yet. So do that manually for them.
                        // This still doesn't provide us with the terms that econom* or *date break down into.

                        //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
                        // Should we just check superclass MultiTermQuery?
                        // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
                        // just test for * in the query_string to determine if we need to do a rewrite() or not
                        if(query_string.contains("*")) {
                            SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
                            IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
                            parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
                        }

                        //System.err.println("#### Query type was: " + parsedQuery.getClass());
                        //logger.error("#### Query type was: " + parsedQuery.getClass());

                        // extract the terms
                        Set<Term> extractedQueryTerms = new HashSet<Term>();
                        parsedQuery.extractTerms(extractedQueryTerms);

                        terms = new Term[extractedQueryTerms.size()];

                        Iterator<Term> termsIterator = extractedQueryTerms.iterator();
                        for(int i = 0; termsIterator.hasNext(); i++) {
                            Term term = termsIterator.next();
                            ///System.err.println("#### Found query term: " + term);
                            ///logger.error("#### Found query term: " + term);

                            terms[i] = term; //(term.field(), term.text());
                        }

                    } catch(Exception queryParseException) {
                        queryParseException.printStackTrace();
                        System.err.println("Exception when parsing query: " + queryParseException.getMessage());
                        System.err.println("#### Query type was: " + parsedQuery.getClass());
                        logger.error("#### Query type was: " + parsedQuery.getClass());
                    }
                    // http://lucene.apache.org/solr/4_7_2/solr-core/org/apache/solr/request/SolrQueryRequestBase.html#close%28%29
                    // close() must be called when the object is no longer in use. Frees resources associated with this request
                    solrQueryRequest.close();
                }

            } else {
                System.err.println("#### CoreContainer is empty");
                logger.error("#### CoreContainer is empty");
            }
        } else {
            System.err.println("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
            logger.error("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
        }


        return terms;
    }

    public SharedSoleneQueryResult runQuery(String query_string)
    {
        if (query_string == null || query_string.equals(""))
        {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        SolrQueryResult solr_query_result = new SolrQueryResult();
        solr_query_result.clear();

        if (_facetQueries.size() > 0)
        {
            HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
            for (String currentQuery : _facetQueries)
            {
                //Facet queries are stored in JSON, so we have to decode it
                Gson gson = new Gson();
                Type type = new TypeToken<List<String>>()
                {
                }.getType();
                List<String> queryElems = gson.fromJson(currentQuery, type);

                //Group each query segment by the index it uses
                for (String currentQueryElement : queryElems)
                {
                    String decodedQueryElement = null;
                    try
                    {
                        decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }

                    int colonIndex = decodedQueryElement.indexOf(":");
                    String indexShortName = decodedQueryElement.substring(0, colonIndex);

                    if (grouping.get(indexShortName) == null)
                    {
                        grouping.put(indexShortName, new ArrayList<String>());
                    }
                    grouping.get(indexShortName).add(decodedQueryElement);
                }
            }

            //Construct the facet query string to add to the regular query string
            StringBuilder facetQueryString = new StringBuilder();
            int keysetCounter = 0;
            for (String key : grouping.keySet())
            {
                StringBuilder currentFacetString = new StringBuilder("(");
                int groupCounter = 0;
                for (String queryElem : grouping.get(key))
                {
                    currentFacetString.append(queryElem);

                    groupCounter++;
                    if (groupCounter < grouping.get(key).size())
                    {
                        currentFacetString.append(" OR ");
                    }
                }
                currentFacetString.append(")");

                facetQueryString.append(currentFacetString);

                keysetCounter++;
                if (keysetCounter < grouping.keySet().size())
                {
                    facetQueryString.append(" AND ");
                }
            }

            if (facetQueryString.length() > 0)
            {
                query_string += " AND " + facetQueryString;
            }
        }


        SolrQuery solrQuery = new SolrQuery(query_string);
        solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
        solrQuery.setStart(start_results); // which result to start from
        solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"

        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // WORKS (search didx core):
        //TI:farming
        //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')


        // which fields to return for each document, we'll add the request for totaltermfreq later
        // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");

        //solrQuery.setTerms(true); // turn on the termsComponent
        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method

        // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
        // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
        // https://wiki.apache.org/solr/TermsComponent

        //solrParams.set("tv.tf", true);// turn on the terms vector Component
        //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ


        if (_facets.size() > 0)
        {
            // enable facet counts in the query response
            solrQuery.setFacet(true); //solrParams.set("facet", "true");
            for (int i = 0; i < _facets.size(); i++)
            {
                // add this field as a facet
                solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
            }
        }

        // the solrserver will now
        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term

        // do the query
        try
        {
            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
            SolrDocumentList hits = solrResponse.getResults();
            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml

            if (hits != null)
            {
                logger.info("*** hits size = " + hits.size());
                logger.info("*** num docs found = " + hits.getNumFound());

                logger.info("*** start results = " + start_results);
                logger.info("*** end results = " + end_results);
                logger.info("*** max docs = " + max_docs);

                // numDocsFound is the total number of matching docs in the collection
                // as opposed to the number of documents returned in the hits list

                solr_query_result.setTotalDocs((int) hits.getNumFound());

                solr_query_result.setStartResults(start_results);
                solr_query_result.setEndResults(start_results + hits.size());


                // get the first field we're searching in, this will be the fallback field
                int sepIndex = query_string.indexOf(":");
                String defaultField = query_string.substring(0, sepIndex);
                //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()

                //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);

                // Output the matching documents
                for (int i = 0; i < hits.size(); i++)
                {
                    SolrDocument doc = hits.get(i);

                    // Need to think about how to support document term frequency. Make zero for now
                    int doc_term_freq = 0;
                    String docOID = (String) doc.get("docOID");
                    Float score = (Float) doc.get("score");

                    logger.info("**** docOID = " + docOID);
                    logger.info("**** score = " + score);


                    // solr returns each term's totaltermfreq, ttf, at the document level, even though
                    // the ttf is the same for each document. So extract this information just for the first document
                    if(i == 0) { // first document, all others repeat the same termfreq data
                        boolean foundTermInfo = false;

                        Collection<String> fieldNames = doc.getFieldNames();
                        for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) {
                            String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
                            //logger.info("@@@@ found fieldName " + fieldName);


                            if(fieldName.startsWith("totaltermfreq")) {
                                //|| fieldName.startsWith("termfreq")) {

                                foundTermInfo = true;

                                // e.g. totaltermfreq(TI,'farming')
                                // e.g. termfreq(TI,'farming')
                                Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
                                Matcher matcher = pattern.matcher(fieldName);
                                String metaField, indexField, queryTerm;
                                while (matcher.find()) {
                                    metaField = matcher.group(1); // termfreq or totaltermfreq
                                    indexField = matcher.group(2); //ZZ, TI
                                    queryTerm = matcher.group(3);

                                    //logger.info("\t@@@@ found field " + indexField);
                                    //logger.info("\t@@@@ queryTerm " + queryTerm);

                                    // Finally, can ask for the totaltermfreq value for this
                                    // searchterm in its indexed field:
                                    // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
                                    Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");

                                    Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");

                                    //System.err.println("**** ttf = " + totaltermfreq);
                                    //System.err.println("**** tf = " + termfreq);
                                    //logger.info("**** ttf = " + totaltermfreq);
                                    //logger.info("**** tf = " + termfreq);
                                    solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
                                }
                            }
                        }
                        if(!foundTermInfo) { // no terms extracted from query_string
                            solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
                        }
                    }

                    solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
                }
            }
            else
            {
                solr_query_result.setTotalDocs(0);

                solr_query_result.setStartResults(0);
                solr_query_result.setEndResults(0);
            }

            solr_query_result.setFacetResults(solrResponse.getFacetFields());
        }
        catch (SolrServerException server_exception)
        {
            server_exception.printStackTrace();
            solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
        }

        return solr_query_result;
    }

    //Greenstone universe operates with a base of 1 for "start_results"
    //But Solr operates from 0
    public void setStartResults(int start_results)
    {
        if (start_results < 1)
        {
            start_results = 1;
        }
        this.start_results = start_results - 1;
    }

    public void cleanUp()
    {
        super.cleanUp();
    }

}
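
For context, a minimal, hypothetical usage sketch of this wrapper follows. The solr URL, core name and search field are placeholders, and real callers live elsewhere in the Greenstone 3 runtime rather than in this file.

import org.apache.solr.client.solrj.impl.HttpSolrServer;

import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
import org.greenstone.gsdl3.util.SolrQueryWrapper;

// Hypothetical driver, not part of the Greenstone source tree.
public class SolrQueryWrapperDemo
{
    public static void main(String[] args)
    {
        SolrQueryWrapper wrapper = new SolrQueryWrapper();

        // point the wrapper at a running solr core inside tomcat's solr webapp
        // (URL and core name are placeholders)
        wrapper.setSolrCore(new HttpSolrServer("http://localhost:8983/solr/demo-didx"));
        wrapper.setCollectionCoreNamePrefix("demo");

        wrapper.setStartResults(1); // Greenstone counts from 1; the wrapper converts to solr's 0-based start
        // the end of the result window comes from end_results, inherited from SharedSoleneQuery (not set here)
        wrapper.setSortField(SolrQueryWrapper.SORT_BY_RANK);
        wrapper.setSortOrder(SolrQueryWrapper.SORT_DESCENDING);

        if (wrapper.initialise()) {
            // "TX" is a placeholder index field; the query syntax is whatever the lucene QParser accepts
            SharedSoleneQueryResult result = wrapper.runQuery("TX:farming");
            if (result != null) {
                System.out.println("Query ran; documents, terms and facets are held in the SolrQueryResult subclass.");
            }
            wrapper.cleanUp();
        }
    }
}
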