source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@ 29209

Last change on this file since 29209 was 29209, checked in by ak19, 10 years ago

Correcting else statement error messages

  • Property svn:executable set to *
File size: 15.8 KB
Line 
1/**********************************************************************
2 *
3 * SolrQueryWrapper.java
4 *
5 * Copyright 2004 The New Zealand Digital Library Project
6 *
7 * A component of the Greenstone digital library software
8 * from the New Zealand Digital Library Project at the
9 * University of Waikato, New Zealand.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 *********************************************************************/
26package org.greenstone.gsdl3.util;
27
28import java.lang.reflect.Type;
29import java.net.URLDecoder;
30import java.util.ArrayList;
31import java.util.Collection;
32import java.util.HashMap;
33import java.util.Iterator;
34import java.util.List;
35import java.util.Set;
36import java.util.HashSet;
37
38import org.apache.log4j.Logger;
39import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
40import org.apache.solr.client.solrj.SolrServer;
41import org.apache.solr.client.solrj.SolrServerException;
42import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
43import org.apache.solr.client.solrj.response.QueryResponse;
44import org.apache.solr.client.solrj.response.TermsResponse;
45
46import org.apache.solr.core.CoreContainer;
47import org.apache.solr.core.SolrCore;
48
49import org.apache.solr.common.SolrDocument;
50import org.apache.solr.common.SolrDocumentList;
51import org.apache.solr.common.params.ModifiableSolrParams;
52import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
53import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
54
55import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
56import org.apache.lucene.index.IndexReader;
57import org.apache.lucene.index.Term;
58import org.apache.solr.search.QParser;
59import org.apache.solr.search.SolrIndexSearcher;
60import org.apache.solr.request.LocalSolrQueryRequest;
61
62import com.google.gson.Gson;
63import com.google.gson.reflect.TypeToken;
64
65public class SolrQueryWrapper extends SharedSoleneQuery
66{
67 public static String SORT_ASCENDING = "asc";
68 public static String SORT_DESCENDING = "desc";
69 public static String SORT_BY_RANK = "score";
70 public static String SORT_BY_INDEX_ORDER = "_docid_";
71
72 static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
73 protected int max_docs = 100;
74 protected String sort_order = SORT_DESCENDING;
75 protected String sort_field = SORT_BY_RANK; // don't want null default for solr
76 protected ArrayList<String> _facets = new ArrayList<String>();
77 protected ArrayList<String> _facetQueries = new ArrayList<String>();
78 SolrServer solr_core = null;
79
/**
 * Creates a wrapper on top of the superclass defaults, with the result
 * window positioned at offset 0.
 */
public SolrQueryWrapper()
{
    super();
    this.start_results = 0;
}
85
86 public void setMaxDocs(int max_docs)
87 {
88 this.max_docs = max_docs;
89 }
90
91 public void setSolrCore(SolrServer solr_core)
92 {
93 this.solr_core = solr_core;
94 }
95 // make sure its not null.
96 public void setSortField(String sort_field) {
97 if (sort_field != null) {
98 this.sort_field = sort_field;
99 }
100 }
101
102 public void setSortOrder(String order)
103 {
104 this.sort_order = order;
105 }
106 public void addFacet(String facet)
107 {
108 if (!_facets.contains(facet))
109 {
110 _facets.add(facet);
111 }
112 }
113
114 public void clearFacets()
115 {
116 _facets.clear();
117 }
118
119 public void addFacetQuery(String facetQuery)
120 {
121 if (!_facetQueries.contains(facetQuery))
122 {
123 _facetQueries.add(facetQuery);
124 }
125 }
126
127 public void clearFacetQueries()
128 {
129 _facetQueries.clear();
130 }
131
132 public boolean initialise()
133 {
134 if (solr_core == null)
135 {
136 utf8out.println("Solr Core not loaded in ");
137 utf8out.flush();
138 return false;
139 }
140 return true;
141 }
142
143
144 /** Extracts the query terms from the query string. The query string can be a boolean
145 * combination of the various search fields with their search terms or phrases
146 */
147 public Term[] getTerms(SolrQuery solrQuery, String query_string)
148 {
149 Term terms[] = null;
150
151 if(solr_core instanceof EmbeddedSolrServer) {
152 EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;
153
154 CoreContainer coreContainer = solrServer.getCoreContainer();
155
156 Collection<SolrCore> solrCores = coreContainer.getCores();
157 if(!solrCores.isEmpty()) {
158 Iterator<SolrCore> coreIterator = solrCores.iterator();
159
160 // Just use the first core, since the term frequency of any term is the same regardless of core
161 if(coreIterator.hasNext()) {
162 SolrCore solrCore = coreIterator.next();
163
164
165 LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
166 Query parsedQuery = null;
167
168 try {
169
170 // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
171 QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
172 parsedQuery = qParser.getQuery();
173
174 // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
175 // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
176 // because it has not done the Query.rewrite() step yet. So do that manually for them.
177 // This still doesn't provide us with the terms that econom* or *date break down into.
178
179 //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
180 // Should we just check superclass MultiTermQuery?
181 // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
182 // just test for * in the query_string to determine if we need to do a rewrite() or not
183 if(query_string.contains("*")) {
184 SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
185 IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
186 parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
187 }
188
189 //System.err.println("#### Query type was: " + parsedQuery.getClass());
190 //logger.error("#### Query type was: " + parsedQuery.getClass());
191
192 // extract the terms
193 Set<Term> extractedQueryTerms = new HashSet<Term>();
194 parsedQuery.extractTerms(extractedQueryTerms);
195
196 terms = new Term[extractedQueryTerms.size()];
197
198 Iterator<Term> termsIterator = extractedQueryTerms.iterator();
199 for(int i = 0; termsIterator.hasNext(); i++) {
200 Term term = termsIterator.next();
201 ///System.err.println("#### Found query term: " + term);
202 ///logger.error("#### Found query term: " + term);
203
204 terms[i] = term; //(term.field(), term.text());
205 }
206
207 } catch(Exception queryParseException) {
208 queryParseException.printStackTrace();
209 System.err.println("Exception when parsing query: " + queryParseException.getMessage());
210 System.err.println("#### Query type was: " + parsedQuery.getClass());
211 logger.error("#### Query type was: " + parsedQuery.getClass());
212 }
213 }
214
215 } else {
216 System.err.println("#### CoreContainer is empty");
217 logger.error("#### CoreContainer is empty");
218 }
219 } else {
220 System.err.println("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
221 logger.error("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
222 }
223
224
225 return terms;
226 }
227
228 public SharedSoleneQueryResult runQuery(String query_string)
229 {
230 if (query_string == null || query_string.equals(""))
231 {
232 utf8out.println("The query word is not indicated ");
233 utf8out.flush();
234 return null;
235 }
236
237 SolrQueryResult solr_query_result = new SolrQueryResult();
238 solr_query_result.clear();
239
240 if (_facetQueries.size() > 0)
241 {
242 HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
243 for (String currentQuery : _facetQueries)
244 {
245 //Facet queries are stored in JSON, so we have to decode it
246 Gson gson = new Gson();
247 Type type = new TypeToken<List<String>>()
248 {
249 }.getType();
250 List<String> queryElems = gson.fromJson(currentQuery, type);
251
252 //Group each query segment by the index it uses
253 for (String currentQueryElement : queryElems)
254 {
255 String decodedQueryElement = null;
256 try
257 {
258 decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
259 }
260 catch (Exception ex)
261 {
262 continue;
263 }
264
265 int colonIndex = currentQueryElement.indexOf(":");
266 String indexShortName = currentQueryElement.substring(0, colonIndex);
267
268 if (grouping.get(indexShortName) == null)
269 {
270 grouping.put(indexShortName, new ArrayList<String>());
271 }
272 grouping.get(indexShortName).add(decodedQueryElement);
273 }
274 }
275
276 //Construct the facet query string to add to the regular query string
277 StringBuilder facetQueryString = new StringBuilder();
278 int keysetCounter = 0;
279 for (String key : grouping.keySet())
280 {
281 StringBuilder currentFacetString = new StringBuilder("(");
282 int groupCounter = 0;
283 for (String queryElem : grouping.get(key))
284 {
285 currentFacetString.append(queryElem);
286
287 groupCounter++;
288 if (groupCounter < grouping.get(key).size())
289 {
290 currentFacetString.append(" OR ");
291 }
292 }
293 currentFacetString.append(")");
294
295 facetQueryString.append(currentFacetString);
296
297 keysetCounter++;
298 if (keysetCounter < grouping.keySet().size())
299 {
300 facetQueryString.append(" AND ");
301 }
302 }
303
304 if (facetQueryString.length() > 0)
305 {
306 query_string += " AND " + facetQueryString;
307 }
308 }
309
310
311 SolrQuery solrQuery = new SolrQuery(query_string);
312 solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
313 solrQuery.setStart(start_results); // which result to start from
314 solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"
315
316 // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
317 // WORKS (search didx core):
318 //TI:farming
319 //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')
320
321
322 // which fields to return for each document, we'll add the request for totaltermfreq later
323 // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
324 solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");
325
326 //solrQuery.setTerms(true); // turn on the termsComponent
327 //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method
328
329 // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
330 // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
331 // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
332 // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
333 // https://wiki.apache.org/solr/TermsComponent
334
335 //solrParams.set("tv.tf", true);// turn on the terms vector Component
336 //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ
337
338
339 if (_facets.size() > 0)
340 {
341 // enable facet counts in the query response
342 solrQuery.setFacet(true); //solrParams.set("facet", "true");
343 for (int i = 0; i < _facets.size(); i++)
344 {
345 // add this field as a facet
346 solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
347 }
348 }
349
350 // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
351 Term[] terms = getTerms(solrQuery, query_string);
352 if(terms != null) {
353 for(int i = 0; i < terms.length; i++) {
354 Term term = terms[i];
355 String field = term.field();
356 String queryTerm = term.text();
357 // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
358
359 solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
360 solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
361 }
362 }
363
364 // do the query
365 try
366 {
367 QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
368 SolrDocumentList hits = solrResponse.getResults();
369 //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml
370
371 if (hits != null)
372 {
373 logger.info("*** hits size = " + hits.size());
374 logger.info("*** num docs found = " + hits.getNumFound());
375
376 logger.info("*** start results = " + start_results);
377 logger.info("*** end results = " + end_results);
378 logger.info("*** max docs = " + max_docs);
379
380 // numDocsFound is the total number of matching docs in the collection
381 // as opposed to the number of documents returned in the hits list
382
383 solr_query_result.setTotalDocs((int) hits.getNumFound());
384
385 solr_query_result.setStartResults(start_results);
386 solr_query_result.setEndResults(start_results + hits.size());
387
388
389 // get the first field we're searching in, this will be the fallback field
390 int sepIndex = query_string.indexOf(":");
391 String defaultField = query_string.substring(0, sepIndex);
392 //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()
393
394 //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);
395
396 // Output the matching documents
397 for (int i = 0; i < hits.size(); i++)
398 {
399 SolrDocument doc = hits.get(i);
400
401 // Need to think about how to support document term frequency. Make zero for now
402 int doc_term_freq = 0;
403 String docOID = (String) doc.get("docOID");
404 Float score = (Float) doc.get("score");
405
406 logger.info("**** docOID = " + docOID);
407 logger.info("**** score = " + score);
408
409
410 // solr returns each term's totaltermfreq, ttf, at the document level, even though
411 // the ttf is the same for each document. So extract this information just for the first document
412 if(i == 0) { // first document
413
414 if(terms != null) {
415 for(int j = 0; j < terms.length; j++) {
416 Term term = terms[j];
417 String field = term.field();
418 String queryTerm = term.text();
419
420 // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
421 Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
422 Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");
423
424 //System.err.println("**** ttf = " + totaltermfreq);
425 //System.err.println("**** tf = " + termfreq);
426 //logger.info("**** ttf = " + totaltermfreq);
427 //logger.info("**** tf = " + termfreq);
428
429 solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
430 }
431 } else { // no terms extracted from query_string
432 solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
433 }
434 }
435
436 solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
437 }
438 }
439 else
440 {
441 solr_query_result.setTotalDocs(0);
442
443 solr_query_result.setStartResults(0);
444 solr_query_result.setEndResults(0);
445 }
446
447 solr_query_result.setFacetResults(solrResponse.getFacetFields());
448 }
449 catch (SolrServerException server_exception)
450 {
451 server_exception.printStackTrace();
452 solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
453 }
454
455 return solr_query_result;
456 }
457
458 //Greenstone universe operates with a base of 1 for "start_results"
459 //But Solr operates from 0
460 public void setStartResults(int start_results)
461 {
462 if (start_results < 0)
463 {
464 start_results = 0;
465 }
466 this.start_results = start_results - 1;
467 }
468
469 public void cleanUp()
470 {
471 super.cleanUp();
472 }
473
474}
Note: See TracBrowser for help on using the repository browser.