source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@ 35172

Last change on this file since 35172 was 35172, checked in by kjdon, 3 years ago

We noticed, when doing a solr search at document level, that we were getting a random section of the document appearing outside the section headings. This is because, when doing a highlighting query at document level, it was returning the first TX element that matched, which was not the top-level content element. That was getting added in as the top-level content, which it shouldn't be. So now we only check one TX element for a match, then stop.

  • Property svn:executable set to *
File size: 21.7 KB
/**********************************************************************
 *
 * SolrQueryWrapper.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.gsdl3.util;

import java.lang.reflect.Type;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.request.LocalSolrQueryRequest;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
public class SolrQueryWrapper extends SharedSoleneQuery
{
  public static String SORT_ASCENDING = "asc";
  public static String SORT_DESCENDING = "desc";
  public static String SORT_BY_RANK = "score";
  public static String SORT_BY_INDEX_ORDER = "_docid_";

  static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
  protected int max_docs = 100;
  protected String sort_order = SORT_DESCENDING;
  // Filter results by document hash, to get results from limited document sections.
  protected String docFilter = null;
  protected String sort_field = SORT_BY_RANK; // don't want null default for solr
  protected ArrayList<String> _facets = new ArrayList<String>();
  protected ArrayList<String> _facetQueries = new ArrayList<String>();
  SolrServer solr_core = null;

  protected String highlight_field = null;

  String collection_core_name_prefix = null;

  public SolrQueryWrapper()
  {
    super();
    start_results = 0;
  }

  public void setMaxDocs(int max_docs)
  {
    this.max_docs = max_docs;
  }

  public void setSolrCore(SolrServer solr_core)
  {
    this.solr_core = solr_core;
  }

  public void setCollectionCoreNamePrefix(String colCoreNamePrefix) {
    this.collection_core_name_prefix = colCoreNamePrefix;
  }

  // make sure it's not null
  public void setSortField(String sort_field) {
    if (sort_field != null) {
      this.sort_field = sort_field;
    }
  }
  public void setHighlightField(String hl_field)
  {
    this.highlight_field = hl_field;
  }
  public void setSortOrder(String order)
  {
    this.sort_order = order;
  }
  public void setDocFilter(String docFilter)
  {
    this.docFilter = docFilter;
  }
  public void addFacet(String facet)
  {
    if (!_facets.contains(facet))
    {
      _facets.add(facet);
    }
  }

  public void clearFacets()
  {
    _facets.clear();
  }

  public void addFacetQuery(String facetQuery)
  {
    if (!_facetQueries.contains(facetQuery))
    {
      _facetQueries.add(facetQuery);
    }
  }

  public void clearFacetQueries()
  {
    _facetQueries.clear();
  }

  public boolean initialise()
  {
    if (solr_core == null)
    {
      utf8out.println("Solr Core not loaded in ");
      utf8out.flush();
      return false;
    }
    return true;
  }


  /**
   * UNUSED.
   * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
   * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
   * of the Embedded kind.
   *
   * The functionality of getTerms has been moved to
   * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
   * tomcat's solr webapp).
   *
   * Extracts the query terms from the query string. The query string can be a boolean
   * combination of the various search fields with their search terms or phrases
   */
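  // Illustrative example (not from the original source): for a query string such as
  //   TX:(farming OR economy)
  // the parsed Lucene query would break down into the terms TX:farming and TX:economy,
  // which is the kind of Term[] this method would have returned.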
  public Term[] getTerms(SolrQuery solrQuery, String query_string)
  {
    Term terms[] = null;

    if(solr_core instanceof EmbeddedSolrServer) {
      EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;

      CoreContainer coreContainer = solrServer.getCoreContainer();

      Collection<SolrCore> solrCores = coreContainer.getCores();
      if(!solrCores.isEmpty()) {
        Iterator<SolrCore> coreIterator = solrCores.iterator();

        // Just use the first core that matches the collection name, since the term
        // frequency of any term is the same regardless of whether it's the didx or sidx core
        boolean foundCore = false;
        while(coreIterator.hasNext() && !foundCore) {
          SolrCore solrCore = coreIterator.next();
          if(this.collection_core_name_prefix != null) {
            if(!solrCore.getName().startsWith(this.collection_core_name_prefix)) {
              //logger.error("### Skipping core not of this collection: " + solrCore.getName());
              continue;
            }
          } else {
            logger.error("### Collection_core_name_prefix not set. Won't try to find terms");
            break;
          }

          //logger.error("### Found core " + solrCore.getName() + " of this collection " + this.collection_core_name_prefix);
          foundCore = true;

          LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
          Query parsedQuery = null;

          try {

            // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
            QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
            parsedQuery = qParser.getQuery();

            // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
            // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
            // because it has not done the Query.rewrite() step yet. So do that manually for them.
            // This still doesn't provide us with the terms that econom* or *date break down into.

            //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
            // Should we just check superclass MultiTermQuery?
            // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
            // just test for * in the query_string to determine if we need to do a rewrite() or not
            if(query_string.contains("*")) {
              SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
              IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
              parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
            }

            //System.err.println("#### Query type was: " + parsedQuery.getClass());
            //logger.error("#### Query type was: " + parsedQuery.getClass());

            // extract the terms
            Set<Term> extractedQueryTerms = new HashSet<Term>();
            parsedQuery.extractTerms(extractedQueryTerms);

            terms = new Term[extractedQueryTerms.size()];

            Iterator<Term> termsIterator = extractedQueryTerms.iterator();
            for(int i = 0; termsIterator.hasNext(); i++) {
              Term term = termsIterator.next();
              ///System.err.println("#### Found query term: " + term);
              ///logger.error("#### Found query term: " + term);

              terms[i] = term; //(term.field(), term.text());
            }

          } catch(Exception queryParseException) {
            queryParseException.printStackTrace();
            System.err.println("Exception when parsing query: " + queryParseException.getMessage());
            System.err.println("#### Query type was: " + parsedQuery.getClass());
            logger.error("#### Query type was: " + parsedQuery.getClass());
          }
          // http://lucene.apache.org/solr/4_7_2/solr-core/org/apache/solr/request/SolrQueryRequestBase.html#close%28%29
          // close() must be called when the object is no longer in use. Frees resources associated with this request
          solrQueryRequest.close();
        }

      } else {
        System.err.println("#### CoreContainer is empty");
        logger.error("#### CoreContainer is empty");
      }
    } else {
      System.err.println("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
      logger.error("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
    }


    return terms;
  }

  public SharedSoleneQueryResult runQuery(String query_string)
  {
    if (query_string == null || query_string.equals(""))
    {
      utf8out.println("The query word is not indicated ");
      utf8out.flush();
      return null;
    }

    SolrQueryResult solr_query_result = new SolrQueryResult();
    solr_query_result.clear();

    if (_facetQueries.size() > 0)
    {
      HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
      for (String currentQuery : _facetQueries)
      {
        // Facet queries are stored in JSON, so we have to decode them
        Gson gson = new Gson();
        Type type = new TypeToken<List<String>>()
        {
        }.getType();
        List<String> queryElems = gson.fromJson(currentQuery, type);

        // Group each query segment by the index it uses
        for (String currentQueryElement : queryElems)
        {
          //logger.info("@@@@ currentQueryElement " + currentQueryElement);

          String decodedQueryElement = null;
          try
          {
            decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
          }
          catch (Exception ex)
          {
            continue;
          }

          int colonIndex = currentQueryElement.indexOf(":");
          String indexShortName = currentQueryElement.substring(0, colonIndex);

          if (grouping.get(indexShortName) == null)
          {
            grouping.put(indexShortName, new ArrayList<String>());
          }
          grouping.get(indexShortName).add(decodedQueryElement);
        }
      }
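      // Illustrative example (hypothetical values): two decoded facet queries,
      //   ["TI:farming", "TI:trade"] and ["ZZ:economy"],
      // would produce the grouping {TI=[TI:farming, TI:trade], ZZ=[ZZ:economy]}.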

      // Construct the facet query string to add to the regular query string
      StringBuilder facetQueryString = new StringBuilder();
      int keysetCounter = 0;
      for (String key : grouping.keySet())
      {
        StringBuilder currentFacetString = new StringBuilder("(");
        int groupCounter = 0;
        for (String queryElem : grouping.get(key))
        {
          currentFacetString.append(queryElem);

          groupCounter++;
          if (groupCounter < grouping.get(key).size())
          {
            currentFacetString.append(" OR ");
          }
        }
        currentFacetString.append(")");

        facetQueryString.append(currentFacetString);

        keysetCounter++;
        if (keysetCounter < grouping.keySet().size())
        {
          facetQueryString.append(" AND ");
        }
      }

      if (facetQueryString.length() > 0)
      {
        query_string += " AND " + facetQueryString;
      }
    }
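    // Continuing the hypothetical example above, the query string would now end in
    //   ... AND (TI:farming OR TI:trade) AND (ZZ:economy)
    // i.e. elements within a group are ORed, and the groups are ANDed to the main query.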


    SolrQuery solrQuery = new SolrQuery(query_string);
    solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
    solrQuery.setStart(start_results); // which result to start from
    solrQuery.setRows(end_results - start_results); // how many results per "page"

    // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
    // WORKS (search didx core):
    //TI:farming
    //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')


    // which fields to return for each document, we'll add the request for totaltermfreq later
    // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
    solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");

    // Turn on highlighting
    solrQuery.setHighlight(true);
    // Return 3 snippets for each document
    solrQuery.setParam("hl.snippets", "3");
    solrQuery.setParam("hl.useFastVectorHighlighter", "true");
    solrQuery.setParam("hl.fl", highlight_field);
    solrQuery.setParam("hl.tag.pre", "<span class=\"snippetText\">" );
    solrQuery.setParam("hl.tag.post","</span>" );
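    // With hl.tag.pre/hl.tag.post set as above, matched query terms in the returned snippets
    // get wrapped as, e.g., <span class="snippetText">farming</span>
    // ('farming' is just an illustrative term, not from the original source)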

    // set the default conjunction op
    solrQuery.setParam("q.op", this.default_conjunction_operator);
    if (docFilter != null) {
      solrQuery.setParam("fq", "docOID:" + docFilter + "*");
    }
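    // e.g. if docFilter were the (hypothetical) document hash "HASH0123abc", this adds the
    // filter query fq=docOID:HASH0123abc*, restricting hits to sections of that one document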
    //solrQuery.setTerms(true); // turn on the termsComponent
    //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method

    // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
    // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
    // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
    // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
    // https://wiki.apache.org/solr/TermsComponent

    //solrParams.set("tv.tf", true);// turn on the terms vector Component
    //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ


    if (_facets.size() > 0)
    {
      // enable facet counts in the query response
      solrQuery.setFacet(true); //solrParams.set("facet", "true");
      for (int i = 0; i < _facets.size(); i++)
      {
        // add this field as a facet
        solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
      }
      //for(int i = 0; i < _facetQueries.size(); i++) {
      //  logger.info("@@@@ facet query i: " + _facetQueries.get(i));
      //}
    }


    // Some debugging
    logger.info("@@@@ solrQuery: " + solrQuery);
    try {
      // https://stackoverflow.com/questions/2632175/decoding-uri-query-string-in-java
      String displayQueryString = URLDecoder.decode(solrQuery.toString().replace("+", " "), "UTF-8");
      logger.info("@@@@ solrQuery URL decoded: " + displayQueryString);
    } catch(Exception uee) { // UnsupportedEncodingException
      logger.info("Got debug exception " + uee.getMessage());
    }


    // the solrserver will now
    // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term

    // do the query
    try
    {
      QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
      SolrDocumentList hits = solrResponse.getResults();
      Map<String, Map<String, List<String>>> hlResponse = solrResponse.getHighlighting();
      solr_query_result.setHighlightResults(hlResponse);
      //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml

      if (hits != null)
      {
        logger.info("*** hits size = " + hits.size()+
                    ", num docs found = " + hits.getNumFound() +
                    ", start results = " + start_results +
                    ", end results = " + end_results+
                    ", max docs = " + max_docs);

        // numDocsFound is the total number of matching docs in the collection
        // as opposed to the number of documents returned in the hits list

        solr_query_result.setTotalDocs((int) hits.getNumFound());

        solr_query_result.setStartResults(start_results);
        solr_query_result.setEndResults(start_results + hits.size());

        // get the first field we're searching in, this will be the fallback field
        int sepIndex = query_string.indexOf(":");
        String defaultField = query_string.substring(0, sepIndex);
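        // e.g. for a query string like "TX:farming AND TI:trade" (hypothetical),
        // defaultField would be "TX"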
        //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()

        //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);

        // Output the matching documents
        for (int i = 0; i < hits.size(); i++)
        {
          SolrDocument doc = hits.get(i);

          // Need to think about how to support document term frequency. Make zero for now
          int doc_term_freq = 0;
          String docOID = (String) doc.get("docOID");
          Float score = (Float) doc.get("score");

          //logger.info("**** docOID = " + docOID);
          //logger.info("**** score = " + score);


          // solr returns each term's totaltermfreq, ttf, at the document level, even though
          // the ttf is the same for each document. So extract this information just for the first document
          // https://wiki.apache.org/solr/FunctionQuery#docfreq

          if(i == 0) { // first document, all others repeat the same termfreq data
            boolean foundTermInfo = false;

            Collection<String> fieldNames = doc.getFieldNames();
            for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) {
              String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
              //logger.info("@@@@ found fieldName " + fieldName);


              if(fieldName.startsWith("totaltermfreq")) {
                //|| fieldName.startsWith("termfreq")) {

                foundTermInfo = true;

                // e.g. totaltermfreq(TI,'farming')
                // e.g. termfreq(TI,'farming')
                Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
                Matcher matcher = pattern.matcher(fieldName);
                String metaField, indexField, queryTerm;
                while (matcher.find()) {
                  metaField = matcher.group(1); // termfreq or totaltermfreq
                  indexField = matcher.group(2); //ZZ, TI
                  queryTerm = matcher.group(3);

                  //logger.info("\t@@@@ found field " + indexField);
                  //logger.info("\t@@@@ queryTerm " + queryTerm);

                  // Finally, can ask for the totaltermfreq value for this
                  // searchterm in its indexed field:
                  // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
                  Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");

                  Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");

                  //System.err.println("**** ttf = " + totaltermfreq);
                  //System.err.println("**** tf = " + termfreq);
                  //logger.info("**** ttf = " + totaltermfreq);
                  //logger.info("**** tf = " + termfreq);
                  solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
                }
              }
            }
            if(!foundTermInfo) { // no terms extracted from query_string
              solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
            }
          }

          solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
        }
      }
      else
      {
        solr_query_result.setTotalDocs(0);

        solr_query_result.setStartResults(0);
        solr_query_result.setEndResults(0);
      }

      solr_query_result.setFacetResults(solrResponse.getFacetFields());
    }
    catch (SolrServerException server_exception)
    {
      server_exception.printStackTrace();
      solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
    }

    return solr_query_result;
  }
  // Highlighting query. Returns the full highlighted text for the document
  public String runHighlightingQuery(String query, String hldocOID)
  {
    SolrQueryResult solr_query_result = new SolrQueryResult();
    solr_query_result.clear();


    /* Create Query */

    SolrQuery solrQuery = new SolrQuery(query);

    /* Set Query Parameters */

    // Turn on highlighting
    solrQuery.setHighlight(true);
    // Extract default field from query

    // Set field for highlighting
    solrQuery.setParam("hl.fl", highlight_field);

    // this option is only available for the OriginalHighlighter (hl.method=original, the default)
    // if we are doing document level search, we only want to highlight the first section
    // (TX element), if applicable. Otherwise a middle section can end up displayed at the start
    // of a document, outside the TOC.
    // if we are doing section level search, there will only be one TX element
    solrQuery.setParam("hl.maxMultiValuedToExamine", "1");

    // Get whole highlighted field
    solrQuery.setHighlightFragsize(0);

    // Return only the required document, by docOID
    solrQuery.setFilterQueries("docOID:"+ hldocOID);

    solrQuery.setHighlightSimplePre("<span class=\"termHighlight\">");
    solrQuery.setHighlightSimplePost("</span>");
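    // With these settings, each matched term in the returned text would be wrapped as, e.g.,
    //   <span class="termHighlight">farming</span>
    // ('farming' is just an illustrative term, not from the original source)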

    // Prepare results
    String text = null;
    // do the query
    try
    {
      QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
      // Get highlighting results
      Map<String,Map<String,List<String>>> highlightingResults = solrResponse.getHighlighting();
      // Check for existing highlighting results
      if (highlightingResults != null && highlightingResults.get(hldocOID) != null && highlightingResults.get(hldocOID).get(highlight_field) != null)
      {
        // Get highlighted document text
        text = highlightingResults.get(hldocOID).get(highlight_field).get(0);
      }
    }
    catch (SolrServerException server_exception)
    {
      server_exception.printStackTrace();

    }
    return text;
  }

  // start results can never be negative; clamp to 0
  public void setStartResults(int start_results)
  {
    if (start_results < 0)
    {
      start_results = 0;
    }
    this.start_results = start_results;
  }

  public void cleanUp()
  {
    super.cleanUp();
  }

}