source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@29986

Last change on this file since 29986 was 29986, checked in by ak19, 9 years ago

The getTerms() functionality previously used by the EmbeddedSolrServer has now been re-implemented for HttpSolrServer with the new custom Greenstone Solr RequestHandler class Greenstone3SearchHandler, which lives on the solr server side, in tomcat's solr webapp. The functionality has been improved, such as being able to search for econom* cat, by recursively calling setRewriteMethod on any PrefixQuery and WildcardQuery (MultiTermQuery) clauses within an overall BooleanQuery, and by handling BooleanQuery.TooManyClauses exceptions when the number of expanded terms is too large, such as for a search of a*.

  • Property svn:executable set to *
File size: 18.1 KB
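
As a rough illustration of the approach the change above describes, the sketch below shows how rewrite methods might be applied recursively before term extraction, assuming the Lucene 4.x API this revision builds against. The class and method names here (RewriteSketch, setRewriteMethodsRecursively, rewriteSafely) are invented for this example and are not part of Greenstone3SearchHandler or of the file listed below.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

// Illustrative only: recursively force a scoring rewrite on PrefixQuery/WildcardQuery
// (both MultiTermQuery subclasses) nested inside a BooleanQuery, then rewrite the whole
// query so that its expanded terms can be extracted.
public class RewriteSketch
{
    public static void setRewriteMethodsRecursively(Query query)
    {
        if (query instanceof MultiTermQuery) {
            // covers PrefixQuery, WildcardQuery and other automaton-based queries
            ((MultiTermQuery) query).setRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
        }
        else if (query instanceof BooleanQuery) {
            for (BooleanClause clause : ((BooleanQuery) query).getClauses()) {
                setRewriteMethodsRecursively(clause.getQuery());
            }
        }
    }

    public static Query rewriteSafely(Query query, IndexReader reader)
    {
        try {
            setRewriteMethodsRecursively(query);
            return query.rewrite(reader);
        } catch (BooleanQuery.TooManyClauses tooMany) {
            // a very broad prefix such as a* can expand to more clauses than BooleanQuery
            // allows; fall back to the unrewritten query rather than fail the whole search
            return query;
        } catch (IOException e) {
            return query;
        }
    }
}

SCORING_BOOLEAN_QUERY_REWRITE expands a prefix or wildcard into an explicit BooleanQuery of matching terms, which is what makes the terms extractable; the trade-off is that a very broad pattern can overflow BooleanQuery's clause limit, hence the TooManyClauses fallback.
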
/**********************************************************************
 *
 * SolrQueryWrapper.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.gsdl3.util;

import java.lang.reflect.Type;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.HashSet;

import java.util.regex.Pattern;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;

import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;

import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.request.LocalSolrQueryRequest;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class SolrQueryWrapper extends SharedSoleneQuery
{
    public static String SORT_ASCENDING = "asc";
    public static String SORT_DESCENDING = "desc";
    public static String SORT_BY_RANK = "score";
    public static String SORT_BY_INDEX_ORDER = "_docid_";

    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
    protected int max_docs = 100;
    protected String sort_order = SORT_DESCENDING;
    protected String sort_field = SORT_BY_RANK; // don't want null default for solr
    protected ArrayList<String> _facets = new ArrayList<String>();
    protected ArrayList<String> _facetQueries = new ArrayList<String>();
    SolrServer solr_core = null;

    String collection_core_name_prefix = null;

    public SolrQueryWrapper()
    {
        super();
        start_results = 0;
    }

    public void setMaxDocs(int max_docs)
    {
        this.max_docs = max_docs;
    }

    public void setSolrCore(SolrServer solr_core)
    {
        this.solr_core = solr_core;
    }

    public void setCollectionCoreNamePrefix(String colCoreNamePrefix) {
        this.collection_core_name_prefix = colCoreNamePrefix;
    }

    // make sure it's not null.
    public void setSortField(String sort_field) {
        if (sort_field != null) {
            this.sort_field = sort_field;
        }
    }

    public void setSortOrder(String order)
    {
        this.sort_order = order;
    }
    public void addFacet(String facet)
    {
        if (!_facets.contains(facet))
        {
            _facets.add(facet);
        }
    }

    public void clearFacets()
    {
        _facets.clear();
    }

    public void addFacetQuery(String facetQuery)
    {
        if (!_facetQueries.contains(facetQuery))
        {
            _facetQueries.add(facetQuery);
        }
    }

    public void clearFacetQueries()
    {
        _facetQueries.clear();
    }

    public boolean initialise()
    {
        if (solr_core == null)
        {
            utf8out.println("Solr Core not loaded in ");
            utf8out.flush();
            return false;
        }
        return true;
    }


    /**
     * UNUSED.
     * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
     * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
     * of the Embedded kind.
     *
     * The functionality of getTerms has been moved to
     * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
     * tomcat's solr webapp).
     *
     * Extracts the query terms from the query string. The query string can be a boolean
     * combination of the various search fields with their search terms or phrases
     */
    public Term[] getTerms(SolrQuery solrQuery, String query_string)
    {
        Term terms[] = null;

        if(solr_core instanceof EmbeddedSolrServer) {
            EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;

            CoreContainer coreContainer = solrServer.getCoreContainer();

            Collection<SolrCore> solrCores = coreContainer.getCores();
            if(!solrCores.isEmpty()) {
                Iterator<SolrCore> coreIterator = solrCores.iterator();

                // Just use the first core that matches the collection name, since the term
                // frequency of any term is the same regardless of whether it's the didx or sidx core
                boolean foundCore = false;
                while(coreIterator.hasNext() && !foundCore) {
                    SolrCore solrCore = coreIterator.next();
                    if(this.collection_core_name_prefix != null) {
                        if(!solrCore.getName().startsWith(this.collection_core_name_prefix)) {
                            //logger.error("### Skipping core not of this collection: " + solrCore.getName());
                            continue;
                        }
                    } else {
                        logger.error("### Collection_core_name_prefix not set. Won't try to find terms");
                        break;
                    }

                    //logger.error("### Found core " + solrCore.getName() + " of this collection " + this.collection_core_name_prefix);
                    foundCore = true;

                    LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
                    Query parsedQuery = null;

                    try {

                        // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
                        QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
                        parsedQuery = qParser.getQuery();

                        // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
                        // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
                        // because it has not done the Query.rewrite() step yet. So do that manually for them.
                        // This still doesn't provide us with the terms that econom* or *date break down into.

                        //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
                        // Should we just check superclass MultiTermQuery?
                        // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
                        // just test for * in the query_string to determine if we need to do a rewrite() or not
                        if(query_string.contains("*")) {
                            SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
                            IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
                            parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
                        }

                        //System.err.println("#### Query type was: " + parsedQuery.getClass());
                        //logger.error("#### Query type was: " + parsedQuery.getClass());

                        // extract the terms
                        Set<Term> extractedQueryTerms = new HashSet<Term>();
                        parsedQuery.extractTerms(extractedQueryTerms);

                        terms = new Term[extractedQueryTerms.size()];

                        Iterator<Term> termsIterator = extractedQueryTerms.iterator();
                        for(int i = 0; termsIterator.hasNext(); i++) {
                            Term term = termsIterator.next();
                            ///System.err.println("#### Found query term: " + term);
                            ///logger.error("#### Found query term: " + term);

                            terms[i] = term; //(term.field(), term.text());
                        }

                    } catch(Exception queryParseException) {
                        queryParseException.printStackTrace();
                        System.err.println("Exception when parsing query: " + queryParseException.getMessage());
                        System.err.println("#### Query type was: " + parsedQuery.getClass());
                        logger.error("#### Query type was: " + parsedQuery.getClass());
                    }
                    // http://lucene.apache.org/solr/4_7_2/solr-core/org/apache/solr/request/SolrQueryRequestBase.html#close%28%29
                    // close() must be called when the object is no longer in use. Frees resources associated with this request
                    solrQueryRequest.close();
                }

            } else {
                System.err.println("#### CoreContainer is empty");
                logger.error("#### CoreContainer is empty");
            }
        } else {
            System.err.println("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
            logger.error("#### Not an EmbeddedSolrServer. SolrQueryWrapper.getTerms() not yet implemented for " + solr_core.getClass());
        }


        return terms;
    }

    public SharedSoleneQueryResult runQuery(String query_string)
    {
        if (query_string == null || query_string.equals(""))
        {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        SolrQueryResult solr_query_result = new SolrQueryResult();
        solr_query_result.clear();

        if (_facetQueries.size() > 0)
        {
            HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
            for (String currentQuery : _facetQueries)
            {
                //Facet queries are stored in JSON, so we have to decode it
                Gson gson = new Gson();
                Type type = new TypeToken<List<String>>()
                {
                }.getType();
                List<String> queryElems = gson.fromJson(currentQuery, type);

                //Group each query segment by the index it uses
                for (String currentQueryElement : queryElems)
                {
                    String decodedQueryElement = null;
                    try
                    {
                        decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }

                    int colonIndex = decodedQueryElement.indexOf(":");
                    String indexShortName = decodedQueryElement.substring(0, colonIndex);

                    if (grouping.get(indexShortName) == null)
                    {
                        grouping.put(indexShortName, new ArrayList<String>());
                    }
                    grouping.get(indexShortName).add(decodedQueryElement);
                }
            }

            //Construct the facet query string to add to the regular query string
            StringBuilder facetQueryString = new StringBuilder();
            int keysetCounter = 0;
            for (String key : grouping.keySet())
            {
                StringBuilder currentFacetString = new StringBuilder("(");
                int groupCounter = 0;
                for (String queryElem : grouping.get(key))
                {
                    currentFacetString.append(queryElem);

                    groupCounter++;
                    if (groupCounter < grouping.get(key).size())
                    {
                        currentFacetString.append(" OR ");
                    }
                }
                currentFacetString.append(")");

                facetQueryString.append(currentFacetString);

                keysetCounter++;
                if (keysetCounter < grouping.keySet().size())
                {
                    facetQueryString.append(" AND ");
                }
            }

            if (facetQueryString.length() > 0)
            {
                query_string += " AND " + facetQueryString;
            }
        }


        SolrQuery solrQuery = new SolrQuery(query_string);
        solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
        solrQuery.setStart(start_results); // which result to start from
        solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"

        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // WORKS (search didx core):
        //TI:farming
        //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')


        // which fields to return for each document, we'll add the request for totaltermfreq later
        // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");

        //solrQuery.setTerms(true); // turn on the termsComponent
        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method

        // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
        // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
        // https://wiki.apache.org/solr/TermsComponent

        //solrParams.set("tv.tf", true);// turn on the terms vector Component
        //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ


        if (_facets.size() > 0)
        {
            // enable facet counts in the query response
            solrQuery.setFacet(true); //solrParams.set("facet", "true");
            for (int i = 0; i < _facets.size(); i++)
            {
                // add this field as a facet
                solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
            }
        }

        // the solrserver will now
        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term

        // do the query
        try
        {
            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
            SolrDocumentList hits = solrResponse.getResults();
            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml

            if (hits != null)
            {
                logger.info("*** hits size = " + hits.size());
                logger.info("*** num docs found = " + hits.getNumFound());

                logger.info("*** start results = " + start_results);
                logger.info("*** end results = " + end_results);
                logger.info("*** max docs = " + max_docs);

                // numDocsFound is the total number of matching docs in the collection
                // as opposed to the number of documents returned in the hits list

                solr_query_result.setTotalDocs((int) hits.getNumFound());

                solr_query_result.setStartResults(start_results);
                solr_query_result.setEndResults(start_results + hits.size());


                // get the first field we're searching in, this will be the fallback field
                int sepIndex = query_string.indexOf(":");
                String defaultField = query_string.substring(0, sepIndex);
                //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()

                //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);

                // Output the matching documents
                for (int i = 0; i < hits.size(); i++)
                {
                    SolrDocument doc = hits.get(i);

                    // Need to think about how to support document term frequency. Make zero for now
                    int doc_term_freq = 0;
                    String docOID = (String) doc.get("docOID");
                    Float score = (Float) doc.get("score");

                    logger.info("**** docOID = " + docOID);
                    logger.info("**** score = " + score);


                    // solr returns each term's totaltermfreq, ttf, at the document level, even though
                    // the ttf is the same for each document. So extract this information just for the first document
                    if(i == 0) { // first document, all others repeat the same termfreq data
                        boolean foundTermInfo = false;

                        Collection<String> fieldNames = doc.getFieldNames();
                        for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) {
                            String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
                            //logger.info("@@@@ found fieldName " + fieldName);


                            if(fieldName.startsWith("totaltermfreq")) {
                                //|| fieldName.startsWith("termfreq")) {

                                foundTermInfo = true;

                                // e.g. totaltermfreq(TI,'farming')
                                // e.g. termfreq(TI,'farming')
                                Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
                                Matcher matcher = pattern.matcher(fieldName);
                                String metaField, indexField, queryTerm;
                                while (matcher.find()) {
                                    metaField = matcher.group(1); // termfreq or totaltermfreq
                                    indexField = matcher.group(2); //ZZ, TI
                                    queryTerm = matcher.group(3);

                                    //logger.info("\t@@@@ found field " + indexField);
                                    //logger.info("\t@@@@ queryTerm " + queryTerm);

                                    // Finally, can ask for the totaltermfreq value for this
                                    // searchterm in its indexed field:
                                    // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
                                    Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");

                                    Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");

                                    //System.err.println("**** ttf = " + totaltermfreq);
                                    //System.err.println("**** tf = " + termfreq);
                                    //logger.info("**** ttf = " + totaltermfreq);
                                    //logger.info("**** tf = " + termfreq);
                                    solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
                                }
                            }
                        }
                        if(!foundTermInfo) { // no terms extracted from query_string
                            solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
                        }
                    }

                    solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
                }
            }
            else
            {
                solr_query_result.setTotalDocs(0);

                solr_query_result.setStartResults(0);
                solr_query_result.setEndResults(0);
            }

            solr_query_result.setFacetResults(solrResponse.getFacetFields());
        }
        catch (SolrServerException server_exception)
        {
            server_exception.printStackTrace();
            solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
        }

        return solr_query_result;
    }

    //Greenstone universe operates with a base of 1 for "start_results"
    //But Solr operates from 0
    public void setStartResults(int start_results)
    {
        if (start_results < 1)
        {
            start_results = 1;
        }
        this.start_results = start_results - 1;
    }

    public void cleanUp()
    {
        super.cleanUp();
    }

}
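
For context, a minimal, hypothetical usage sketch of this wrapper follows. The solr URL, core name and search field are placeholders, and real callers live elsewhere in the Greenstone 3 runtime rather than in this file.

import org.apache.solr.client.solrj.impl.HttpSolrServer;

import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
import org.greenstone.gsdl3.util.SolrQueryWrapper;

// Hypothetical driver, not part of the Greenstone source tree.
public class SolrQueryWrapperDemo
{
    public static void main(String[] args)
    {
        SolrQueryWrapper wrapper = new SolrQueryWrapper();

        // point the wrapper at a running solr core inside tomcat's solr webapp
        // (URL and core name are placeholders)
        wrapper.setSolrCore(new HttpSolrServer("http://localhost:8983/solr/demo-didx"));
        wrapper.setCollectionCoreNamePrefix("demo");

        wrapper.setStartResults(1); // Greenstone counts from 1; the wrapper converts to solr's 0-based start
        // the end of the result window comes from end_results, inherited from SharedSoleneQuery (not set here)
        wrapper.setSortField(SolrQueryWrapper.SORT_BY_RANK);
        wrapper.setSortOrder(SolrQueryWrapper.SORT_DESCENDING);

        if (wrapper.initialise()) {
            // "TX" is a placeholder index field; the query syntax is whatever the lucene QParser accepts
            SharedSoleneQueryResult result = wrapper.runQuery("TX:farming");
            if (result != null) {
                System.out.println("Query ran; documents, terms and facets are held in the SolrQueryResult subclass.");
            }
            wrapper.cleanUp();
        }
    }
}
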