source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@ 29217

Last change on this file since 29217 was 29217, checked in by ak19, 10 years ago

Terms must be found in a solrCore (sidx or didx) of THIS collection, not in the first solrCore in the CoreContainer (solr.xml), since the CoreContainer can contain core descriptors of other GS collections as well.

  • Property svn:executable set to *
File size: 16.4 KB
/**********************************************************************
 *
 * SolrQueryWrapper.java
 *
 * Copyright 2004 The New Zealand Digital Library Project
 *
 * A component of the Greenstone digital library software
 * from the New Zealand Digital Library Project at the
 * University of Waikato, New Zealand.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *********************************************************************/
package org.greenstone.gsdl3.util;

import java.lang.reflect.Type;
import java.net.URLDecoder;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.HashSet;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.client.solrj.response.TermsResponse;

import org.apache.solr.core.CoreContainer;
import org.apache.solr.core.SolrCore;

import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;

import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.request.LocalSolrQueryRequest;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

public class SolrQueryWrapper extends SharedSoleneQuery
{
    public static String SORT_ASCENDING = "asc";
    public static String SORT_DESCENDING = "desc";
    public static String SORT_BY_RANK = "score";
    public static String SORT_BY_INDEX_ORDER = "_docid_";

    static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
    protected int max_docs = 100;
    protected String sort_order = SORT_DESCENDING;
    protected String sort_field = SORT_BY_RANK; // don't want null default for solr
    protected ArrayList<String> _facets = new ArrayList<String>();
    protected ArrayList<String> _facetQueries = new ArrayList<String>();
    SolrServer solr_core = null;

    String collection_core_name_prefix = null;

    public SolrQueryWrapper()
    {
        super();
        start_results = 0;
    }

    public void setMaxDocs(int max_docs)
    {
        this.max_docs = max_docs;
    }

    public void setSolrCore(SolrServer solr_core)
    {
        this.solr_core = solr_core;
    }

    public void setCollectionCoreNamePrefix(String colCoreNamePrefix) {
        this.collection_core_name_prefix = colCoreNamePrefix;
    }

    // make sure it's not null
    public void setSortField(String sort_field) {
        if (sort_field != null) {
            this.sort_field = sort_field;
        }
    }

    public void setSortOrder(String order)
    {
        this.sort_order = order;
    }
    public void addFacet(String facet)
    {
        if (!_facets.contains(facet))
        {
            _facets.add(facet);
        }
    }

    public void clearFacets()
    {
        _facets.clear();
    }

    public void addFacetQuery(String facetQuery)
    {
        if (!_facetQueries.contains(facetQuery))
        {
            _facetQueries.add(facetQuery);
        }
    }

    public void clearFacetQueries()
    {
        _facetQueries.clear();
    }

    public boolean initialise()
    {
        if (solr_core == null)
        {
            utf8out.println("Solr Core not loaded in ");
            utf8out.flush();
            return false;
        }
        return true;
    }


    /** Extracts the query terms from the query string. The query string can be a boolean
     * combination of the various search fields with their search terms or phrases
     */
    public Term[] getTerms(SolrQuery solrQuery, String query_string)
    {
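        // Illustrative example (field names as used in the comments below): for a query string
        // such as  TI:farming AND ZZ:economy  the parsed Lucene query yields the terms
        // [TI:farming, ZZ:economy], which the caller then uses to request the
        // termfreq()/totaltermfreq() pseudo-fields from Solr.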
        Term terms[] = null;

        if(solr_core instanceof EmbeddedSolrServer) {
            EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;

            CoreContainer coreContainer = solrServer.getCoreContainer();

            Collection<SolrCore> solrCores = coreContainer.getCores();
            if(!solrCores.isEmpty()) {
                Iterator<SolrCore> coreIterator = solrCores.iterator();

                // Just use the first core that matches the collection name, since the term
                // frequency of any term is the same regardless of whether it's the didx or sidx core
                boolean foundCore = false;
                while(coreIterator.hasNext() && !foundCore) {
                    SolrCore solrCore = coreIterator.next();
                    if(!solrCore.getName().startsWith(this.collection_core_name_prefix)) {
                        //logger.error("### Skipping core not of this collection: " + solrCore.getName());
                        continue;
                    }
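                    // Note on the naming assumption (illustrative): if this collection's cores are
                    // registered as, say, "<collection>-didx" and "<collection>-sidx", only cores whose
                    // names start with this collection's prefix are used; cores that other collections
                    // register in the shared solr.xml CoreContainer were skipped above.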

                    //logger.error("### Found core " + solrCore.getName() + " of this collection " + this.collection_core_name_prefix);
                    foundCore = true;

                    LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
                    Query parsedQuery = null;

                    try {

                        // get the qparser; the default is LuceneQParserPlugin, which is called "lucene", see http://wiki.apache.org/solr/QueryParser
                        QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
                        parsedQuery = qParser.getQuery();

                        // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
                        // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
                        // because it has not done the Query.rewrite() step yet. So do that manually for them.
                        // This still doesn't provide us with the terms that econom* or *date break down into.

                        //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
                        // Should we just check superclass MultiTermQuery?
                        // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
                        // just test for * in the query_string to determine if we need to do a rewrite() or not
                        if(query_string.contains("*")) {
                            SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
                            IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
                            parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
                        }

                        //System.err.println("#### Query type was: " + parsedQuery.getClass());
                        //logger.error("#### Query type was: " + parsedQuery.getClass());

                        // extract the terms
                        Set<Term> extractedQueryTerms = new HashSet<Term>();
                        parsedQuery.extractTerms(extractedQueryTerms);

                        terms = new Term[extractedQueryTerms.size()];

                        Iterator<Term> termsIterator = extractedQueryTerms.iterator();
                        for(int i = 0; termsIterator.hasNext(); i++) {
                            Term term = termsIterator.next();
                            ///System.err.println("#### Found query term: " + term);
                            ///logger.error("#### Found query term: " + term);

                            terms[i] = term; //(term.field(), term.text());
                        }

                    } catch(Exception queryParseException) {
                        queryParseException.printStackTrace();
                        System.err.println("Exception when parsing query: " + queryParseException.getMessage());
                        // parsedQuery can still be null here if QParser.getParser() or getQuery() threw
                        Object parsedQueryType = (parsedQuery == null) ? "null" : parsedQuery.getClass();
                        System.err.println("#### Query type was: " + parsedQueryType);
                        logger.error("#### Query type was: " + parsedQueryType);
                    }
                }

            } else {
                System.err.println("#### CoreContainer is empty");
                logger.error("#### CoreContainer is empty");
            }
        } else {
            System.err.println("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
            logger.error("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
        }


        return terms;
    }

    public SharedSoleneQueryResult runQuery(String query_string)
    {
        if (query_string == null || query_string.equals(""))
        {
            utf8out.println("The query word is not indicated ");
            utf8out.flush();
            return null;
        }

        SolrQueryResult solr_query_result = new SolrQueryResult();
        solr_query_result.clear();

        if (_facetQueries.size() > 0)
        {
            HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
            for (String currentQuery : _facetQueries)
            {
                //Facet queries are stored in JSON, so we have to decode them
                Gson gson = new Gson();
                Type type = new TypeToken<List<String>>()
                {
                }.getType();
                List<String> queryElems = gson.fromJson(currentQuery, type);
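                // Illustrative example (hypothetical values): a facet query may arrive as the JSON
                // array ["TI%3Afarming", "ZZ%3Aeconomy"], i.e. URL-encoded "indexShortName:value" pairs.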

                //Group each query segment by the index it uses
                for (String currentQueryElement : queryElems)
                {
                    String decodedQueryElement = null;
                    try
                    {
                        decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
                    }
                    catch (Exception ex)
                    {
                        continue;
                    }

                    int colonIndex = currentQueryElement.indexOf(":");
                    String indexShortName = currentQueryElement.substring(0, colonIndex);

                    if (grouping.get(indexShortName) == null)
                    {
                        grouping.put(indexShortName, new ArrayList<String>());
                    }
                    grouping.get(indexShortName).add(decodedQueryElement);
                }
            }

            //Construct the facet query string to add to the regular query string
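            // Values for the same index are ORed together and the per-index groups are ANDed,
            // giving something of the form (illustrative) "(TI:farming OR TI:dairy) AND (ZZ:economy)".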
            StringBuilder facetQueryString = new StringBuilder();
            int keysetCounter = 0;
            for (String key : grouping.keySet())
            {
                StringBuilder currentFacetString = new StringBuilder("(");
                int groupCounter = 0;
                for (String queryElem : grouping.get(key))
                {
                    currentFacetString.append(queryElem);

                    groupCounter++;
                    if (groupCounter < grouping.get(key).size())
                    {
                        currentFacetString.append(" OR ");
                    }
                }
                currentFacetString.append(")");

                facetQueryString.append(currentFacetString);

                keysetCounter++;
                if (keysetCounter < grouping.keySet().size())
                {
                    facetQueryString.append(" AND ");
                }
            }

            if (facetQueryString.length() > 0)
            {
                query_string += " AND " + facetQueryString;
            }
        }


        SolrQuery solrQuery = new SolrQuery(query_string);
        solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
        solrQuery.setStart(start_results); // which result to start from
        solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"
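        // e.g. (illustrative) if start_results is 0 and end_results is 19, Solr is asked for
        // start=0 and rows=20, i.e. the first twenty matching documents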

        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // WORKS (search didx core):
        //TI:farming
        //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')


        // which fields to return for each document, we'll add the request for totaltermfreq later
        // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");

        //solrQuery.setTerms(true); // turn on the termsComponent
        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method

        // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
        // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
        // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
        // https://wiki.apache.org/solr/TermsComponent

        //solrParams.set("tv.tf", true); // turn on the terms vector Component
        //solrParams.set("tv.fl", "ZZ"); // which field to get the terms from /// ZZ


        if (_facets.size() > 0)
        {
            // enable facet counts in the query response
            solrQuery.setFacet(true); //solrParams.set("facet", "true");
            for (int i = 0; i < _facets.size(); i++)
            {
                // add this field as a facet
                solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
            }
        }

        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
        Term[] terms = getTerms(solrQuery, query_string);
        if(terms != null) {
            for(int i = 0; i < terms.length; i++) {
                Term term = terms[i];
                String field = term.field();
                String queryTerm = term.text();
                // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')

                solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
                solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
            }
        }
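        // The field list now effectively asks Solr for docOID, score, plus one
        // totaltermfreq(...) and one termfreq(...) pseudo-field per query term,
        // e.g. totaltermfreq(TI,'farming') as in the fl example above (illustrative).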

        // do the query
        try
        {
            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
            SolrDocumentList hits = solrResponse.getResults();
            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml

            if (hits != null)
            {
                logger.info("*** hits size = " + hits.size());
                logger.info("*** num docs found = " + hits.getNumFound());

                logger.info("*** start results = " + start_results);
                logger.info("*** end results = " + end_results);
                logger.info("*** max docs = " + max_docs);

                // numDocsFound is the total number of matching docs in the collection
                // as opposed to the number of documents returned in the hits list

                solr_query_result.setTotalDocs((int) hits.getNumFound());

                solr_query_result.setStartResults(start_results);
                solr_query_result.setEndResults(start_results + hits.size());


                // get the first field we're searching in; this will be the fallback field
                int sepIndex = query_string.indexOf(":");
                String defaultField = query_string.substring(0, sepIndex);
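                // e.g. (illustrative) for a query_string of "TI:farming AND ZZ:economy", defaultField is "TI"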
                //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()

                //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);

                // Output the matching documents
                for (int i = 0; i < hits.size(); i++)
                {
                    SolrDocument doc = hits.get(i);

                    // Need to think about how to support document term frequency. Make zero for now
                    int doc_term_freq = 0;
                    String docOID = (String) doc.get("docOID");
                    Float score = (Float) doc.get("score");

                    logger.info("**** docOID = " + docOID);
                    logger.info("**** score = " + score);


                    // solr returns each term's totaltermfreq, ttf, at the document level, even though
                    // the ttf is the same for each document. So extract this information just for the first document
                    if(i == 0) { // first document

                        if(terms != null) {
                            for(int j = 0; j < terms.length; j++) {
                                Term term = terms[j];
                                String field = term.field();
                                String queryTerm = term.text();

                                // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
                                Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
                                Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");

                                //System.err.println("**** ttf = " + totaltermfreq);
                                //System.err.println("**** tf = " + termfreq);
                                //logger.info("**** ttf = " + totaltermfreq);
                                //logger.info("**** tf = " + termfreq);

                                solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
                            }
                        } else { // no terms extracted from query_string
                            solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
                        }
                    }

                    solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
                }
            }
            else
            {
                solr_query_result.setTotalDocs(0);

                solr_query_result.setStartResults(0);
                solr_query_result.setEndResults(0);
            }

            solr_query_result.setFacetResults(solrResponse.getFacetFields());
        }
        catch (SolrServerException server_exception)
        {
            server_exception.printStackTrace();
            solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
        }

        return solr_query_result;
    }

    //Greenstone universe operates with a base of 1 for "start_results"
    //But Solr operates from 0
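    // e.g. a Greenstone start_results of 1 corresponds to a Solr start offset of 0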
    public void setStartResults(int start_results)
    {
        if (start_results < 1) // clamp to the Greenstone minimum of 1, so the 0-based Solr start can't go negative
        {
            start_results = 1;
        }
        this.start_results = start_results - 1;
    }

    public void cleanUp()
    {
        super.cleanUp();
    }

}