Index: /gs3-extensions/solr/trunk/src/build.xml
===================================================================
--- /gs3-extensions/solr/trunk/src/build.xml (revision 29985)
+++ /gs3-extensions/solr/trunk/src/build.xml (revision 29986)
@@ -130,4 +130,20 @@
+
+
+
+
+
+
+
+
+
+
+
@@ -188,7 +204,8 @@
- Copying xalan related jar files from ${web.libdir} into ${tomcat.dir}/webapps/solr.war
+ Copying xalan related jar files, morphology and gs3-solrserver jars from ${web.libdir} into ${tomcat.dir}/webapps/solr.war
+
@@ -224,5 +241,5 @@
-
+
+
+
+
+
+
+
+
Index: /gs3-extensions/solr/trunk/src/conf/solrconfig.xml
===================================================================
--- /gs3-extensions/solr/trunk/src/conf/solrconfig.xml (revision 29985)
+++ /gs3-extensions/solr/trunk/src/conf/solrconfig.xml (revision 29986)
@@ -824,5 +824,6 @@
queries across multiple shards
-->
-
+
+
-
+
+
explicit
Index: /gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java
===================================================================
--- /gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java (revision 29985)
+++ /gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java (revision 29986)
@@ -36,4 +36,7 @@
import java.util.HashSet;
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
@@ -149,5 +152,15 @@
- /** Extracts the query terms from the query string. The query string can be a boolean
+ /**
+ * UNUSED.
+ * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
+ * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
+ * of the Embedded kind.
+ *
+ * The functionality of getTerms has been moved to
+ * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
+ * tomcat's solr webapp).
+ *
+ * Extracts the query terms from the query string. The query string can be a boolean
* combination of the various search fields with their search terms or phrases
*/
@@ -371,17 +384,6 @@
}
+ // the solrserver will now
// get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
- Term[] terms = getTerms(solrQuery, query_string);
- if(terms != null) {
- for(int i = 0; i < terms.length; i++) {
- Term term = terms[i];
- String field = term.field();
- String queryTerm = term.text();
- // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
-
- solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
- solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
- }
- }
// do the query
@@ -433,24 +435,47 @@
// solr returns each term's totaltermfreq, ttf, at the document level, even though
// the ttf is the same for each document. So extract this information just for the first document
- if(i == 0) { // first document
-
- if(terms != null) {
- for(int j = 0; j < terms.length; j++) {
- Term term = terms[j];
- String field = term.field();
- String queryTerm = term.text();
-
- // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
- Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
- Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");
-
- //System.err.println("**** ttf = " + totaltermfreq);
- //System.err.println("**** tf = " + termfreq);
- //logger.info("**** ttf = " + totaltermfreq);
- //logger.info("**** tf = " + termfreq);
+ if(i == 0) { // first document, all others repeat the same termfreq data
+ boolean foundTermInfo = false;
+
+ Collection fieldNames = doc.getFieldNames();
+ for(Iterator it = fieldNames.iterator(); it.hasNext(); ) {
+ String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
+ //logger.info("@@@@ found fieldName " + fieldName);
- solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
- }
- } else { // no terms extracted from query_string
+
+ if(fieldName.startsWith("totaltermfreq")) {
+ //|| fieldName.startsWith("termfreq")) {
+
+ foundTermInfo = true;
+
+ // e.g. totaltermfreq(TI,'farming')
+ // e.g. termfreq(TI,'farming')
+ Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
+ Matcher matcher = pattern.matcher(fieldName);
+ String metaField, indexField, queryTerm;
+ while (matcher.find()) {
+ metaField = matcher.group(1); // termfreq or totaltermfreq
+ indexField = matcher.group(2); //ZZ, TI
+ queryTerm = matcher.group(3);
+
+ //logger.info("\t@@@@ found field " + indexField);
+ //logger.info("\t@@@@ queryTerm " + queryTerm);
+
+ // Finally, can ask for the totaltermfreq value for this
+ // searchterm in its indexed field:
+ // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
+ Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");
+
+ Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");
+
+ //System.err.println("**** ttf = " + totaltermfreq);
+ //System.err.println("**** tf = " + termfreq);
+ //logger.info("**** ttf = " + totaltermfreq);
+ //logger.info("**** tf = " + termfreq);
+ solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
+ }
+ }
+ }
+ if(!foundTermInfo) { // no terms extracted from query_string
solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
}
Index: /gs3-extensions/solr/trunk/src/src/java/org/greenstone/solrserver/Greenstone3SearchHandler.java
===================================================================
--- /gs3-extensions/solr/trunk/src/src/java/org/greenstone/solrserver/Greenstone3SearchHandler.java (revision 29986)
+++ /gs3-extensions/solr/trunk/src/src/java/org/greenstone/solrserver/Greenstone3SearchHandler.java (revision 29986)
@@ -0,0 +1,247 @@
+/**********************************************************************
+ *
+ * Greenstone3SearchHandler.java
+ *
+ * Copyright 2015 The New Zealand Digital Library Project
+ *
+ * A component of the Greenstone digital library software
+ * from the New Zealand Digital Library Project at the
+ * University of Waikato, New Zealand.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ *********************************************************************/
+
+package org.greenstone.solrserver;
+
+import org.apache.solr.handler.component.SearchHandler;
+
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+//import org.apache.log4j.Logger;
+
+import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams, a subclass of SolrParams
+
+import org.apache.solr.core.CoreContainer;
+import org.apache.solr.core.SolrCore;
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
+
+
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.SolrIndexSearcher;
+
+import org.apache.solr.request.SolrQueryRequest;
+import org.apache.solr.response.SolrQueryResponse;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+import java.util.HashSet;
+
+/**
+ * This class is a custom Solr RequestHandler that sits on the solr server side (in tomcat's solr webapp)
+ * and when it receives a query request (sent to this SearchHandler), it will expand the query terms
+ * by calling query.rewrite and then request the totaltermfreq and termfreq for these individual terms.
+ * This class was made necessary by the fact that solr/lucene index locking exceptions occurred when
+ * this code used to be in ext/solr's SolrQueryWrapper.java::getTerms().
+ *
+ * With the customisations in this class, can search a Solr collection for: econom* cat
+ * And the total and term frequencies will be returned for all expanded forms, depending on the analyzer.
+ */
+
+
+// Important page:
+// https://wiki.apache.org/solr/SolrPlugins
+public class Greenstone3SearchHandler extends SearchHandler
+{
+    // IMPORTANT NOTE: Logging doesn't work in this class either with log4j or slf4j,
+    // but System.err goes to catalina.out.
+
+    /**
+     * Rewrite method tried first when expanding MultiTermQueries. It is less CPU intensive
+     * than MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE. This class never reassigns the field
+     * while handling a request: fallback rewrite methods are passed as arguments instead, so
+     * one query that hits TooManyClauses cannot change how later requests get expanded.
+     */
+    protected MultiTermQuery.RewriteMethod currentRewriteMethod
+        = MultiTermQuery.CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE;
+
+    /** Same as getSimplified(query, currentRewriteMethod). */
+    protected Query getSimplified(Query query)
+    {
+        return getSimplified(query, currentRewriteMethod);
+    }
+
+    /**
+     * Recursively calls setRewriteMethod on any MultiTermQueries inside the given (boolean) query,
+     * since by default PrefixQueries get rewritten to ConstantScoreQueries and don't get expanded.
+     * Doing this first ensures that MultiTermQueries like PrefixQueries and WildcardQueries can
+     * later get expanded by Query.rewrite(), including when embedded in BooleanQueries.
+     * See http://stackoverflow.com/questions/3060636/lucene-score-calculation-with-a-prefixquery
+     * See also http://trac.greenstone.org/ticket/845 and http://trac.greenstone.org/changeset/26157
+     *
+     * @param query the parsed query; it and its boolean clauses are modified in place
+     * @param rewriteMethod the rewrite method to set on each MultiTermQuery found
+     * @return the query instance that was passed in
+     */
+    protected Query getSimplified(Query query, MultiTermQuery.RewriteMethod rewriteMethod)
+    {
+        if(query instanceof MultiTermQuery) { // base case: PrefixQuery or WildcardQuery
+            ((MultiTermQuery)query).setRewriteMethod(rewriteMethod);
+        }
+        else if(query instanceof BooleanQuery) {
+            for(BooleanClause clause : (BooleanQuery)query) {
+                clause.setQuery(getSimplified(clause.getQuery(), rewriteMethod));
+            }
+        }
+        // another type of query, leave as-is
+        return query;
+    }
+
+    /** Same as expandQuery(req, parsedQuery, currentRewriteMethod). */
+    protected Query expandQuery(SolrQueryRequest req, Query parsedQuery) throws Exception
+    {
+        return expandQuery(req, parsedQuery, currentRewriteMethod);
+    }
+
+    /**
+     * Sets rewriteMethod on the MultiTermQueries in parsedQuery, then rewrites the query
+     * against the request's index to expand any Prefix- and WildcardQueries contained in it.
+     *
+     * @param req the request whose searcher supplies the IndexReader
+     * @param parsedQuery the query to expand
+     * @param rewriteMethod the rewrite method to set before rewriting
+     * @return the rewritten query
+     * @throws Exception whatever getSimplified() or Query.rewrite() throws, e.g. TooManyClauses
+     */
+    protected Query expandQuery(SolrQueryRequest req, Query parsedQuery,
+                                MultiTermQuery.RewriteMethod rewriteMethod) throws Exception
+    {
+        parsedQuery = getSimplified(parsedQuery, rewriteMethod);
+
+        SolrIndexSearcher searcher = req.getSearcher();
+        IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
+        return parsedQuery.rewrite(indexReader); // used to get rewritten to ConstantScoreQuery
+    }
+
+    /**
+     * Expands the query, retrying with other rewrite methods when the expansion produces
+     * too many boolean clauses.
+     * See also common-src/indexers/lucene-gs/src/org/greenstone/LuceneWrapper3/GS2LuceneQuery.java
+     * of http://trac.greenstone.org/changeset/26157 and http://trac.greenstone.org/ticket/845
+     */
+    private Query expandWithFallbacks(SolrQueryRequest req, Query parsedQuery) throws Exception
+    {
+        try {
+            return expandQuery(req, parsedQuery);
+        } catch(BooleanQuery.TooManyClauses ex) { // hits this exception if searching solr coll for "a*"
+            System.err.println("@@@@ Encountered TooManyClauses Exception: " + ex.getMessage());
+            System.err.println("@@@@ Trying CustomRewriteMethod");
+        }
+
+        MultiTermQuery.ConstantScoreAutoRewrite customRewriteMethod = new MultiTermQuery.ConstantScoreAutoRewrite();
+        customRewriteMethod.setDocCountPercent(100.0);
+        customRewriteMethod.setTermCountCutoff(350); // same as default
+        try {
+            return expandQuery(req, parsedQuery, customRewriteMethod);
+        } catch(BooleanQuery.TooManyClauses bex) { // still too many clauses
+            System.err.println("@@@@ Encountered TooManyClauses Exception despite CustomRewriteMethod: "
+                               + bex.getMessage());
+            System.err.println("@@@@ Using default Multiterm RewriteMethod");
+        }
+
+        // do what the code originally did: use the default rewriteMethod which uses a default
+        // docCountPercent=0.1 (%) and termCountCutoff=350. This will succeed, but probably
+        // won't expand * in Prefix- and WildcardQueries
+        return expandQuery(req, parsedQuery, MultiTermQuery.CONSTANT_SCORE_AUTO_REWRITE_DEFAULT);
+    }
+
+    /**
+     * Adds totaltermfreq(field,'term') and termfreq(field,'term') to the field list of
+     * solrParams for every term extracted from parsedQuery, in sorted term order.
+     */
+    private void addTermFrequencyFields(SolrQuery solrParams, Query parsedQuery)
+    {
+        Set<Term> extractedQueryTerms = new HashSet<Term>();
+        try {
+            parsedQuery.extractTerms(extractedQueryTerms);
+        } catch(UnsupportedOperationException ex) {
+            // Presumably what an un-rewritten MultiTermQuery throws (confirm against Lucene).
+            // Search without term frequency fields rather than failing the whole request.
+            System.err.println("@@@@ Unable to extract terms from query " + parsedQuery + ": " + ex);
+            return;
+        }
+
+        // need to sort the terms for presentation, since a Set is unsorted
+        List<Term> termsList = new ArrayList<Term>(extractedQueryTerms);
+        java.util.Collections.sort(termsList); // Term implements Comparable, terms sorted alphabetically
+
+        for(Term term : termsList) {
+            String field = term.field();
+            String queryTerm = term.text();
+
+            // totaltermfreq(TI, 'farming')
+            // termfreq(TI, 'farming')
+            solrParams.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
+            solrParams.addField("termfreq(" + field + ",'" + queryTerm + "')");
+        }
+    }
+
+    /**
+     * Adds term frequency fields for the (expanded) terms of the "q" parameter to the
+     * request's params, then passes the modified request on to SearchHandler.
+     */
+    @Override
+    public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception
+    {
+        SolrQuery solrParams = new SolrQuery();
+        solrParams.add(req.getParams());
+
+        String query_string = solrParams.get("q");
+        if(query_string == null || query_string.equals("")) {
+            log.error("@@@@@@@@@ " + this.getClass() + " - QUERY STRING EMPTY");
+        }
+        else {
+            QParser qParser = QParser.getParser(query_string, "lucene", req);
+            Query parsedQuery = qParser.getQuery();
+
+            // For PrefixQuery or WildcardQuery (a subclass of AutomatonQuery, incl RegexpQ),
+            // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
+            // because it has not done the Query.rewrite() step yet. So do that manually for them.
+            // Can be a BooleanQuery containing PrefixQuery/WildcardQuery among its clauses, so
+            // just test for * in the query_string to determine if we need to do a rewrite() or not
+            if(query_string.contains("*")) {
+                parsedQuery = expandWithFallbacks(req, parsedQuery);
+            }
+
+            addTermFrequencyFields(solrParams, parsedQuery);
+        }
+
+        // set to modified SolrQuery SolrParams
+        req.setParams(solrParams);
+        // send off modified request
+        super.handleRequestBody(req, rsp);
+    }
+}