source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@ 35172

Last change on this file since 35172 was 35172, checked in by kjdon, 3 years ago

We noticed, when doing a solr search at document level, that we were getting a random section of the document appearing outside the section headings. This is because, when doing a highlighting query at document level, it was returning the first TX element that matched, which was not the top-level content element. That was getting added in as the top-level content, which it shouldn't be. So now we only check one TX element for a match, then stop.

  • Property svn:executable set to *
File size: 21.7 KB
/**********************************************************************
 *
 * SolrQueryWrapper.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.gsdl3.util;

import java.lang.reflect.Type;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.HashSet;
import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;
import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.request.LocalSolrQueryRequest;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
public class SolrQueryWrapper extends SharedSoleneQuery
{
  public static String SORT_ASCENDING = "asc";
  public static String SORT_DESCENDING = "desc";
  public static String SORT_BY_RANK = "score";
  public static String SORT_BY_INDEX_ORDER = "_docid_";

  static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
  protected int max_docs = 100;
  protected String sort_order = SORT_DESCENDING;
  // Filter results by document hash, to get results from limited document sections.
  protected String docFilter = null;
  protected String sort_field = SORT_BY_RANK; // don't want null default for solr
  protected ArrayList<String> _facets = new ArrayList<String>();
  protected ArrayList<String> _facetQueries = new ArrayList<String>();
  SolrServer solr_core = null;

  protected String highlight_field = null;

  String collection_core_name_prefix = null;

  public SolrQueryWrapper()
  {
    super();
    start_results = 0;
  }

  public void setMaxDocs(int max_docs)
  {
    this.max_docs = max_docs;
  }

  public void setSolrCore(SolrServer solr_core)
  {
    this.solr_core = solr_core;
  }

  public void setCollectionCoreNamePrefix(String colCoreNamePrefix) {
    this.collection_core_name_prefix = colCoreNamePrefix;
  }

  // make sure it's not null
  public void setSortField(String sort_field) {
    if (sort_field != null) {
      this.sort_field = sort_field;
    }
  }
  public void setHighlightField(String hl_field)
  {
    this.highlight_field = hl_field;
  }
  public void setSortOrder(String order)
  {
    this.sort_order = order;
  }
  public void setDocFilter(String docFilter)
  {
    this.docFilter = docFilter;
  }
  public void addFacet(String facet)
  {
    if (!_facets.contains(facet))
    {
      _facets.add(facet);
    }
  }

  public void clearFacets()
  {
    _facets.clear();
  }

  public void addFacetQuery(String facetQuery)
  {
    if (!_facetQueries.contains(facetQuery))
    {
      _facetQueries.add(facetQuery);
    }
  }

  public void clearFacetQueries()
  {
    _facetQueries.clear();
  }

  public boolean initialise()
  {
    if (solr_core == null)
    {
      utf8out.println("Solr Core not loaded in ");
      utf8out.flush();
      return false;
    }
    return true;
  }


  /**
   * UNUSED.
   * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
   * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
   * of the Embedded kind.
   *
   * The functionality of getTerms has been moved to
   * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
   * tomcat's solr webapp).
   *
   * Extracts the query terms from the query string. The query string can be a boolean
   * combination of the various search fields with their search terms or phrases
   */
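  // Illustrative example (not from the original source): for a query string such as
  //   TX:(farming OR economy)
  // the parsed Lucene query would break down into the terms TX:farming and TX:economy,
  // which is the kind of Term[] this method would have returned.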
  public Term[] getTerms(SolrQuery solrQuery, String query_string)
  {
    Term terms[] = null;

    if(solr_core instanceof EmbeddedSolrServer) {
      EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;

      CoreContainer coreContainer = solrServer.getCoreContainer();

      Collection<SolrCore> solrCores = coreContainer.getCores();
      if(!solrCores.isEmpty()) {
        Iterator<SolrCore> coreIterator = solrCores.iterator();

        // Just use the first core that matches the collection name, since the term
        // frequency of any term is the same regardless of whether it's the didx or sidx core
        boolean foundCore = false;
        while(coreIterator.hasNext() && !foundCore) {
          SolrCore solrCore = coreIterator.next();
          if(this.collection_core_name_prefix != null) {
            if(!solrCore.getName().startsWith(this.collection_core_name_prefix)) {
              //logger.error("### Skipping core not of this collection: " + solrCore.getName());
              continue;
            }
          } else {
            logger.error("### Collection_core_name_prefix not set. Won't try to find terms");
            break;
          }

          //logger.error("### Found core " + solrCore.getName() + " of this collection " + this.collection_core_name_prefix);
          foundCore = true;

          LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
          Query parsedQuery = null;

          try {

            // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
            QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
            parsedQuery = qParser.getQuery();

            // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
            // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
            // because it has not done the Query.rewrite() step yet. So do that manually for them.
            // This still doesn't provide us with the terms that econom* or *date break down into.

            //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
            // Should we just check superclass MultiTermQuery?
            // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
            // just test for * in the query_string to determine if we need to do a rewrite() or not
            if(query_string.contains("*")) {
              SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
              IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
              parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
            }

            //System.err.println("#### Query type was: " + parsedQuery.getClass());
            //logger.error("#### Query type was: " + parsedQuery.getClass());

            // extract the terms
            Set<Term> extractedQueryTerms = new HashSet<Term>();
            parsedQuery.extractTerms(extractedQueryTerms);

            terms = new Term[extractedQueryTerms.size()];

            Iterator<Term> termsIterator = extractedQueryTerms.iterator();
            for(int i = 0; termsIterator.hasNext(); i++) {
              Term term = termsIterator.next();
              ///System.err.println("#### Found query term: " + term);
              ///logger.error("#### Found query term: " + term);

              terms[i] = term; //(term.field(), term.text());
            }

          } catch(Exception queryParseException) {
            queryParseException.printStackTrace();
            System.err.println("Exception when parsing query: " + queryParseException.getMessage());
            System.err.println("#### Query type was: " + parsedQuery.getClass());
            logger.error("#### Query type was: " + parsedQuery.getClass());
          }
          // http://lucene.apache.org/solr/4_7_2/solr-core/org/apache/solr/request/SolrQueryRequestBase.html#close%28%29
          // close() must be called when the object is no longer in use. Frees resources associated with this request
          solrQueryRequest.close();
        }

      } else {
        System.err.println("#### CoreContainer is empty");
        logger.error("#### CoreContainer is empty");
      }
    } else {
      System.err.println("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
      logger.error("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
    }


    return terms;
  }

  public SharedSoleneQueryResult runQuery(String query_string)
  {
    if (query_string == null || query_string.equals(""))
    {
      utf8out.println("The query word is not indicated ");
      utf8out.flush();
      return null;
    }

    SolrQueryResult solr_query_result = new SolrQueryResult();
    solr_query_result.clear();

    if (_facetQueries.size() > 0)
    {
      HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
      for (String currentQuery : _facetQueries)
      {
        // Facet queries are stored in JSON, so we have to decode them
        Gson gson = new Gson();
        Type type = new TypeToken<List<String>>()
        {
        }.getType();
        List<String> queryElems = gson.fromJson(currentQuery, type);

        // Group each query segment by the index it uses
        for (String currentQueryElement : queryElems)
        {
          //logger.info("@@@@ currentQueryElement " + currentQueryElement);

          String decodedQueryElement = null;
          try
          {
            decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
          }
          catch (Exception ex)
          {
            continue;
          }

          int colonIndex = currentQueryElement.indexOf(":");
          String indexShortName = currentQueryElement.substring(0, colonIndex);

          if (grouping.get(indexShortName) == null)
          {
            grouping.put(indexShortName, new ArrayList<String>());
          }
          grouping.get(indexShortName).add(decodedQueryElement);
        }
      }
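      // Illustrative example (hypothetical values): two decoded facet queries,
      //   ["TI:farming", "TI:trade"] and ["ZZ:economy"],
      // would produce the grouping {TI=[TI:farming, TI:trade], ZZ=[ZZ:economy]}.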

      // Construct the facet query string to add to the regular query string
      StringBuilder facetQueryString = new StringBuilder();
      int keysetCounter = 0;
      for (String key : grouping.keySet())
      {
        StringBuilder currentFacetString = new StringBuilder("(");
        int groupCounter = 0;
        for (String queryElem : grouping.get(key))
        {
          currentFacetString.append(queryElem);

          groupCounter++;
          if (groupCounter < grouping.get(key).size())
          {
            currentFacetString.append(" OR ");
          }
        }
        currentFacetString.append(")");

        facetQueryString.append(currentFacetString);

        keysetCounter++;
        if (keysetCounter < grouping.keySet().size())
        {
          facetQueryString.append(" AND ");
        }
      }

      if (facetQueryString.length() > 0)
      {
        query_string += " AND " + facetQueryString;
      }
    }
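    // Continuing the hypothetical example above, the query string would now end in
    //   ... AND (TI:farming OR TI:trade) AND (ZZ:economy)
    // i.e. elements within a group are ORed, and the groups are ANDed to the main query.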


    SolrQuery solrQuery = new SolrQuery(query_string);
    solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
    solrQuery.setStart(start_results); // which result to start from
    solrQuery.setRows(end_results - start_results); // how many results per "page"

    // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
    // WORKS (search didx core):
    //TI:farming
    //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')


    // which fields to return for each document, we'll add the request for totaltermfreq later
    // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
    solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");

    // Turn on highlighting
    solrQuery.setHighlight(true);
    // Return 3 snippets for each document
    solrQuery.setParam("hl.snippets", "3");
    solrQuery.setParam("hl.useFastVectorHighlighter", "true");
    solrQuery.setParam("hl.fl", highlight_field);
    solrQuery.setParam("hl.tag.pre", "<span class=\"snippetText\">" );
    solrQuery.setParam("hl.tag.post","</span>" );
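    // With hl.tag.pre/hl.tag.post set as above, matched query terms in the returned snippets
    // get wrapped as, e.g., <span class="snippetText">farming</span>
    // ('farming' is just an illustrative term, not from the original source)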

    // set the default conjunction op
    solrQuery.setParam("q.op", this.default_conjunction_operator);
    if (docFilter != null) {
      solrQuery.setParam("fq", "docOID:" + docFilter + "*");
    }
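    // e.g. if docFilter were the (hypothetical) document hash "HASH0123abc", this adds the
    // filter query fq=docOID:HASH0123abc*, restricting hits to sections of that one document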
    //solrQuery.setTerms(true); // turn on the termsComponent
    //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method

    // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
    // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
    // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
    // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
    // https://wiki.apache.org/solr/TermsComponent

    //solrParams.set("tv.tf", true);// turn on the terms vector Component
    //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ


    if (_facets.size() > 0)
    {
      // enable facet counts in the query response
      solrQuery.setFacet(true); //solrParams.set("facet", "true");
      for (int i = 0; i < _facets.size(); i++)
      {
        // add this field as a facet
        solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
      }
      //for(int i = 0; i < _facetQueries.size(); i++) {
      //  logger.info("@@@@ facet query i: " + _facetQueries.get(i));
      //}
    }


    // Some debugging
    logger.info("@@@@ solrQuery: " + solrQuery);
    try {
      // https://stackoverflow.com/questions/2632175/decoding-uri-query-string-in-java
      String displayQueryString = URLDecoder.decode(solrQuery.toString().replace("+", " "), "UTF-8");
      logger.info("@@@@ solrQuery URL decoded: " + displayQueryString);
    } catch(Exception uee) { // UnsupportedEncodingException
      logger.info("Got debug exception " + uee.getMessage());
    }


    // the solrserver will now
    // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term

    // do the query
    try
    {
      QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
      SolrDocumentList hits = solrResponse.getResults();
      Map<String, Map<String, List<String>>> hlResponse = solrResponse.getHighlighting();
      solr_query_result.setHighlightResults(hlResponse);
      //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml

      if (hits != null)
      {
        logger.info("*** hits size = " + hits.size()+
                    ", num docs found = " + hits.getNumFound() +
                    ", start results = " + start_results +
                    ", end results = " + end_results+
                    ", max docs = " + max_docs);

        // numDocsFound is the total number of matching docs in the collection
        // as opposed to the number of documents returned in the hits list

        solr_query_result.setTotalDocs((int) hits.getNumFound());

        solr_query_result.setStartResults(start_results);
        solr_query_result.setEndResults(start_results + hits.size());

        // get the first field we're searching in, this will be the fallback field
        int sepIndex = query_string.indexOf(":");
        String defaultField = query_string.substring(0, sepIndex);
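        // e.g. for a query string like "TX:farming AND TI:trade" (hypothetical),
        // defaultField would be "TX"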
        //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()

        //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);

        // Output the matching documents
        for (int i = 0; i < hits.size(); i++)
        {
          SolrDocument doc = hits.get(i);

          // Need to think about how to support document term frequency. Make zero for now
          int doc_term_freq = 0;
          String docOID = (String) doc.get("docOID");
          Float score = (Float) doc.get("score");

          //logger.info("**** docOID = " + docOID);
          //logger.info("**** score = " + score);


          // solr returns each term's totaltermfreq, ttf, at the document level, even though
          // the ttf is the same for each document. So extract this information just for the first document
          // https://wiki.apache.org/solr/FunctionQuery#docfreq

          if(i == 0) { // first document, all others repeat the same termfreq data
            boolean foundTermInfo = false;

            Collection<String> fieldNames = doc.getFieldNames();
            for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) {
              String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
              //logger.info("@@@@ found fieldName " + fieldName);


              if(fieldName.startsWith("totaltermfreq")) {
                //|| fieldName.startsWith("termfreq")) {

                foundTermInfo = true;

                // e.g. totaltermfreq(TI,'farming')
                // e.g. termfreq(TI,'farming')
                Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
                Matcher matcher = pattern.matcher(fieldName);
                String metaField, indexField, queryTerm;
                while (matcher.find()) {
                  metaField = matcher.group(1); // termfreq or totaltermfreq
                  indexField = matcher.group(2); //ZZ, TI
                  queryTerm = matcher.group(3);

                  //logger.info("\t@@@@ found field " + indexField);
                  //logger.info("\t@@@@ queryTerm " + queryTerm);

                  // Finally, can ask for the totaltermfreq value for this
                  // searchterm in its indexed field:
                  // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
                  Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");

                  Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");

                  //System.err.println("**** ttf = " + totaltermfreq);
                  //System.err.println("**** tf = " + termfreq);
                  //logger.info("**** ttf = " + totaltermfreq);
                  //logger.info("**** tf = " + termfreq);
                  solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
                }
              }
            }
            if(!foundTermInfo) { // no terms extracted from query_string
              solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
            }
          }

          solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
        }
      }
      else
      {
        solr_query_result.setTotalDocs(0);

        solr_query_result.setStartResults(0);
        solr_query_result.setEndResults(0);
      }

      solr_query_result.setFacetResults(solrResponse.getFacetFields());
    }
    catch (SolrServerException server_exception)
    {
      server_exception.printStackTrace();
      solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
    }

    return solr_query_result;
  }
  // Highlighting query. Returns the full highlighted text for the document
  public String runHighlightingQuery(String query, String hldocOID)
  {
    SolrQueryResult solr_query_result = new SolrQueryResult();
    solr_query_result.clear();


    /* Create Query */

    SolrQuery solrQuery = new SolrQuery(query);

    /* Set Query Parameters */

    // Turn on highlighting
    solrQuery.setHighlight(true);
    // Extract default field from query

    // Set field for highlighting
    solrQuery.setParam("hl.fl", highlight_field);

    // this option is only available for the OriginalHighlighter (hl.method=original, the default)
    // if we are doing document level search, we only want to highlight the first section
    // (TX element), if applicable. Otherwise a middle section can end up displayed at the start
    // of a document, outside the TOC.
    // if we are doing section level search, there will only be one TX element
    solrQuery.setParam("hl.maxMultiValuedToExamine", "1");

    // Get whole highlighted field
    solrQuery.setHighlightFragsize(0);

    // Return only the required document, by docOID
    solrQuery.setFilterQueries("docOID:"+ hldocOID);

    solrQuery.setHighlightSimplePre("<span class=\"termHighlight\">");
    solrQuery.setHighlightSimplePost("</span>");
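    // With these settings, each matched term in the returned text would be wrapped as, e.g.,
    //   <span class="termHighlight">farming</span>
    // ('farming' is just an illustrative term, not from the original source)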

    // Prepare results
    String text = null;
    // do the query
    try
    {
      QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
      // Get highlighting results
      Map<String,Map<String,List<String>>> highlightingResults = solrResponse.getHighlighting();
      // Check for existing highlighting results
      if (highlightingResults != null && highlightingResults.get(hldocOID) != null && highlightingResults.get(hldocOID).get(highlight_field) != null)
      {
        // Get highlighted document text
        text = highlightingResults.get(hldocOID).get(highlight_field).get(0);
      }
    }
    catch (SolrServerException server_exception)
    {
      server_exception.printStackTrace();

    }
    return text;
  }

  // start results can never be negative; clamp to 0
  public void setStartResults(int start_results)
  {
    if (start_results < 0)
    {
      start_results = 0;
    }
    this.start_results = start_results;
  }

  public void cleanUp()
  {
    super.cleanUp();
  }

}