Context Navigation

source: gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java@ 29209

Last change on this file since 29209 was 29209, checked in by ak19, 10 years ago
Correcting else statement error messages
Property svn:executable set to *
File size: 15.8 KB

Line
1	/**********************************************************************
2	*
3	* SolrQueryWrapper.java
4	*
5	* Copyright 2004 The New Zealand Digital Library Project
6	*
7	* A component of the Greenstone digital library software
8	* from the New Zealand Digital Library Project at the
9	* University of Waikato, New Zealand.
10	*
11	* This program is free software; you can redistribute it and/or modify
12	* it under the terms of the GNU General Public License as published by
13	* the Free Software Foundation; either version 2 of the License, or
14	* (at your option) any later version.
15	*
16	* This program is distributed in the hope that it will be useful,
17	* but WITHOUT ANY WARRANTY; without even the implied warranty of
18	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	* GNU General Public License for more details.
20	*
21	* You should have received a copy of the GNU General Public License
22	* along with this program; if not, write to the Free Software
23	* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	*
25	*********************************************************************/
26	package org.greenstone.gsdl3.util;
27
28	import java.lang.reflect.Type;
29	import java.net.URLDecoder;
30	import java.util.ArrayList;
31	import java.util.Collection;
32	import java.util.HashMap;
33	import java.util.Iterator;
34	import java.util.List;
35	import java.util.Set;
36	import java.util.HashSet;
37
38	import org.apache.log4j.Logger;
39	import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
40	import org.apache.solr.client.solrj.SolrServer;
41	import org.apache.solr.client.solrj.SolrServerException;
42	import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
43	import org.apache.solr.client.solrj.response.QueryResponse;
44	import org.apache.solr.client.solrj.response.TermsResponse;
45
46	import org.apache.solr.core.CoreContainer;
47	import org.apache.solr.core.SolrCore;
48
49	import org.apache.solr.common.SolrDocument;
50	import org.apache.solr.common.SolrDocumentList;
51	import org.apache.solr.common.params.ModifiableSolrParams;
52	import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
53	import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
54
55	import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
56	import org.apache.lucene.index.IndexReader;
57	import org.apache.lucene.index.Term;
58	import org.apache.solr.search.QParser;
59	import org.apache.solr.search.SolrIndexSearcher;
60	import org.apache.solr.request.LocalSolrQueryRequest;
61
62	import com.google.gson.Gson;
63	import com.google.gson.reflect.TypeToken;
64
65	public class SolrQueryWrapper extends SharedSoleneQuery
66	{
67	public static String SORT_ASCENDING = "asc";
68	public static String SORT_DESCENDING = "desc";
69	public static String SORT_BY_RANK = "score";
70	public static String SORT_BY_INDEX_ORDER = "_docid_";
71
72	static Logger logger = Logger.getLogger(org.greenstone.gsdl3.util.SolrQueryWrapper.class.getName());
73	protected int max_docs = 100;
74	protected String sort_order = SORT_DESCENDING;
75	protected String sort_field = SORT_BY_RANK; // don't want null default for solr
76	protected ArrayList<String> _facets = new ArrayList<String>();
77	protected ArrayList<String> _facetQueries = new ArrayList<String>();
78	SolrServer solr_core = null;
79
/**
 * Creates a wrapper whose result window starts at offset 0.
 */
public SolrQueryWrapper()
{
    // the no-argument superclass constructor runs implicitly before this line
    this.start_results = 0;
}
85
86	public void setMaxDocs(int max_docs)
87	{
88	this.max_docs = max_docs;
89	}
90
91	public void setSolrCore(SolrServer solr_core)
92	{
93	this.solr_core = solr_core;
94	}
95	// make sure its not null.
96	public void setSortField(String sort_field) {
97	if (sort_field != null) {
98	this.sort_field = sort_field;
99	}
100	}
101
102	public void setSortOrder(String order)
103	{
104	this.sort_order = order;
105	}
106	public void addFacet(String facet)
107	{
108	if (!_facets.contains(facet))
109	{
110	_facets.add(facet);
111	}
112	}
113
114	public void clearFacets()
115	{
116	_facets.clear();
117	}
118
119	public void addFacetQuery(String facetQuery)
120	{
121	if (!_facetQueries.contains(facetQuery))
122	{
123	_facetQueries.add(facetQuery);
124	}
125	}
126
127	public void clearFacetQueries()
128	{
129	_facetQueries.clear();
130	}
131
132	public boolean initialise()
133	{
134	if (solr_core == null)
135	{
136	utf8out.println("Solr Core not loaded in ");
137	utf8out.flush();
138	return false;
139	}
140	return true;
141	}
142
143
144	/** Extracts the query terms from the query string. The query string can be a boolean
145	* combination of the various search fields with their search terms or phrases
146	*/
147	public Term[] getTerms(SolrQuery solrQuery, String query_string)
148	{
149	Term terms[] = null;
150
151	if(solr_core instanceof EmbeddedSolrServer) {
152	EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;
153
154	CoreContainer coreContainer = solrServer.getCoreContainer();
155
156	Collection<SolrCore> solrCores = coreContainer.getCores();
157	if(!solrCores.isEmpty()) {
158	Iterator<SolrCore> coreIterator = solrCores.iterator();
159
160	// Just use the first core, since the term frequency of any term is the same regardless of core
161	if(coreIterator.hasNext()) {
162	SolrCore solrCore = coreIterator.next();
163
164
165	LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
166	Query parsedQuery = null;
167
168	try {
169
170	// get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
171	QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
172	parsedQuery = qParser.getQuery();
173
174	// For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
175	// like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
176	// because it has not done the Query.rewrite() step yet. So do that manually for them.
177	// This still doesn't provide us with the terms that econom* or *date break down into.
178
179	//if(parsedQuery instanceof PrefixQuery \|\| parsedQuery instanceof AutomatonQuery) {
180	// Should we just check superclass MultiTermQuery?
181	// Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
182	// just test for * in the query_string to determine if we need to do a rewrite() or not
183	if(query_string.contains("*")) {
184	SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
185	IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
186	parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
187	}
188
189	//System.err.println("#### Query type was: " + parsedQuery.getClass());
190	//logger.error("#### Query type was: " + parsedQuery.getClass());
191
192	// extract the terms
193	Set<Term> extractedQueryTerms = new HashSet<Term>();
194	parsedQuery.extractTerms(extractedQueryTerms);
195
196	terms = new Term[extractedQueryTerms.size()];
197
198	Iterator<Term> termsIterator = extractedQueryTerms.iterator();
199	for(int i = 0; termsIterator.hasNext(); i++) {
200	Term term = termsIterator.next();
201	///System.err.println("#### Found query term: " + term);
202	///logger.error("#### Found query term: " + term);
203
204	terms[i] = term; //(term.field(), term.text());
205	}
206
207	} catch(Exception queryParseException) {
208	queryParseException.printStackTrace();
209	System.err.println("Exception when parsing query: " + queryParseException.getMessage());
210	System.err.println("#### Query type was: " + parsedQuery.getClass());
211	logger.error("#### Query type was: " + parsedQuery.getClass());
212	}
213	}
214
215	} else {
216	System.err.println("#### CoreContainer is empty");
217	logger.error("#### CoreContainer is empty");
218	}
219	} else {
220	System.err.println("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
221	logger.error("#### Not an EmbeddedSolrServer. This shouldn't happen." + solr_core.getClass());
222	}
223
224
225	return terms;
226	}
227
228	public SharedSoleneQueryResult runQuery(String query_string)
229	{
230	if (query_string == null \|\| query_string.equals(""))
231	{
232	utf8out.println("The query word is not indicated ");
233	utf8out.flush();
234	return null;
235	}
236
237	SolrQueryResult solr_query_result = new SolrQueryResult();
238	solr_query_result.clear();
239
240	if (_facetQueries.size() > 0)
241	{
242	HashMap<String, ArrayList<String>> grouping = new HashMap<String, ArrayList<String>>();
243	for (String currentQuery : _facetQueries)
244	{
245	//Facet queries are stored in JSON, so we have to decode it
246	Gson gson = new Gson();
247	Type type = new TypeToken<List<String>>()
248	{
249	}.getType();
250	List<String> queryElems = gson.fromJson(currentQuery, type);
251
252	//Group each query segment by the index it uses
253	for (String currentQueryElement : queryElems)
254	{
255	String decodedQueryElement = null;
256	try
257	{
258	decodedQueryElement = URLDecoder.decode(currentQueryElement, "UTF-8");
259	}
260	catch (Exception ex)
261	{
262	continue;
263	}
264
265	int colonIndex = currentQueryElement.indexOf(":");
266	String indexShortName = currentQueryElement.substring(0, colonIndex);
267
268	if (grouping.get(indexShortName) == null)
269	{
270	grouping.put(indexShortName, new ArrayList<String>());
271	}
272	grouping.get(indexShortName).add(decodedQueryElement);
273	}
274	}
275
276	//Construct the facet query string to add to the regular query string
277	StringBuilder facetQueryString = new StringBuilder();
278	int keysetCounter = 0;
279	for (String key : grouping.keySet())
280	{
281	StringBuilder currentFacetString = new StringBuilder("(");
282	int groupCounter = 0;
283	for (String queryElem : grouping.get(key))
284	{
285	currentFacetString.append(queryElem);
286
287	groupCounter++;
288	if (groupCounter < grouping.get(key).size())
289	{
290	currentFacetString.append(" OR ");
291	}
292	}
293	currentFacetString.append(")");
294
295	facetQueryString.append(currentFacetString);
296
297	keysetCounter++;
298	if (keysetCounter < grouping.keySet().size())
299	{
300	facetQueryString.append(" AND ");
301	}
302	}
303
304	if (facetQueryString.length() > 0)
305	{
306	query_string += " AND " + facetQueryString;
307	}
308	}
309
310
311	SolrQuery solrQuery = new SolrQuery(query_string);
312	solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
313	solrQuery.setStart(start_results); // which result to start from
314	solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"
315
316	// http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
317	// WORKS (search didx core):
318	//TI:farming
319	//docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')
320
321
322	// which fields to return for each document, we'll add the request for totaltermfreq later
323	// fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
324	solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");
325
326	//solrQuery.setTerms(true); // turn on the termsComponent
327	//solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method
328
329	// http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
330	// http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
331	// http://stackoverflow.com/questions/13031534/word-frequency-in-solr
332	// http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
333	// https://wiki.apache.org/solr/TermsComponent
334
335	//solrParams.set("tv.tf", true);// turn on the terms vector Component
336	//solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ
337
338
339	if (_facets.size() > 0)
340	{
341	// enable facet counts in the query response
342	solrQuery.setFacet(true); //solrParams.set("facet", "true");
343	for (int i = 0; i < _facets.size(); i++)
344	{
345	// add this field as a facet
346	solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
347	}
348	}
349
350	// get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
351	Term[] terms = getTerms(solrQuery, query_string);
352	if(terms != null) {
353	for(int i = 0; i < terms.length; i++) {
354	Term term = terms[i];
355	String field = term.field();
356	String queryTerm = term.text();
357	// totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
358
359	solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
360	solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
361	}
362	}
363
364	// do the query
365	try
366	{
367	QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
368	SolrDocumentList hits = solrResponse.getResults();
369	//TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml
370
371	if (hits != null)
372	{
373	logger.info("*** hits size = " + hits.size());
374	logger.info("*** num docs found = " + hits.getNumFound());
375
376	logger.info("*** start results = " + start_results);
377	logger.info("*** end results = " + end_results);
378	logger.info("*** max docs = " + max_docs);
379
380	// numDocsFound is the total number of matching docs in the collection
381	// as opposed to the number of documents returned in the hits list
382
383	solr_query_result.setTotalDocs((int) hits.getNumFound());
384
385	solr_query_result.setStartResults(start_results);
386	solr_query_result.setEndResults(start_results + hits.size());
387
388
389	// get the first field we're searching in, this will be the fallback field
390	int sepIndex = query_string.indexOf(":");
391	String defaultField = query_string.substring(0, sepIndex);
392	//String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()
393
394	//solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);
395
396	// Output the matching documents
397	for (int i = 0; i < hits.size(); i++)
398	{
399	SolrDocument doc = hits.get(i);
400
401	// Need to think about how to support document term frequency. Make zero for now
402	int doc_term_freq = 0;
403	String docOID = (String) doc.get("docOID");
404	Float score = (Float) doc.get("score");
405
406	logger.info("**** docOID = " + docOID);
407	logger.info("**** score = " + score);
408
409
410	// solr returns each term's totaltermfreq, ttf, at the document level, even though
411	// the ttf is the same for each document. So extract this information just for the first document
412	if(i == 0) { // first document
413
414	if(terms != null) {
415	for(int j = 0; j < terms.length; j++) {
416	Term term = terms[j];
417	String field = term.field();
418	String queryTerm = term.text();
419
420	// totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
421	Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
422	Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");
423
424	//System.err.println("**** ttf = " + totaltermfreq);
425	//System.err.println("**** tf = " + termfreq);
426	//logger.info("**** ttf = " + totaltermfreq);
427	//logger.info("**** tf = " + termfreq);
428
429	solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
430	}
431	} else { // no terms extracted from query_string
432	solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
433	}
434	}
435
436	solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
437	}
438	}
439	else
440	{
441	solr_query_result.setTotalDocs(0);
442
443	solr_query_result.setStartResults(0);
444	solr_query_result.setEndResults(0);
445	}
446
447	solr_query_result.setFacetResults(solrResponse.getFacetFields());
448	}
449	catch (SolrServerException server_exception)
450	{
451	server_exception.printStackTrace();
452	solr_query_result.setError(SolrQueryResult.SERVER_ERROR);
453	}
454
455	return solr_query_result;
456	}
457
458	//Greenstone universe operates with a base of 1 for "start_results"
459	//But Solr operates from 0
460	public void setStartResults(int start_results)
461	{
462	if (start_results < 0)
463	{
464	start_results = 0;
465	}
466	this.start_results = start_results - 1;
467	}
468
469	public void cleanUp()
470	{
471	super.cleanUp();
472	}
473
474	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: