Changeset 29986 for gs3-extensions

Show
Ignore:
Timestamp:
19.06.2015 23:21:26 (4 years ago)
Author:
ak19
Message:

The getTerms() functionality previously used by the EmbeddedSolrServer has now been re-implemented for HttpSolrServer with the new custom Greenstone Solr RequestHandler class Greenstone3SearchHandler, which lives on the solr server side, in tomcat's solr webapp. The functionality has been improved, such as being able to search for: econom* cat, by recursively calling setRewriteMethods on any PrefixQuery and WildcardQuery MultiQueries within an overall BooleanQuery, and by handling BooleanQuery.TooManyClauses exceptions when the number of expanded terms is too large, such as for a search of a*.

Location:
gs3-extensions/solr/trunk/src
Files:
2 added
3 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/build.xml

    r29932 r29986  
    130130    </jar> 
    131131    <copy file="${build.home}/gs3-solr.jar" todir="${web.libdir}"/> 
     132 
     133    <!-- customisations to the http solr server --> 
     134    <jar destfile="${build.home}/gs3-solrserver.jar"> 
     135      <fileset dir="${build.home}"> 
     136        <include name="org/greenstone/solrserver/**"/> 
     137      </fileset> 
     138      <manifest> 
     139        <attribute name="Built-By" value="greenstone3" /> 
     140      </manifest> 
     141    </jar> 
     142    <!--<copy file="${build.home}/gs3-solrserver.jar" todir="${tomcat.dir}/webapps/solr/WEB-INF/lib"/> 
     143 
     144          But there's no guarantee the destination will exist at the beginning. 
     145          The jar file also needs to go into solr.war. Which is taken care of in the add-service task. 
     146          Instead, for testing, call ant compile-gs3-solrserver, which will copy it across to solr webapp.  
     147      --> 
    132148  </target> 
    133149   
     
    188204    <copy todir="${tomcat.dir}/webapps" file="webapps/solr.war" /> 
    189205    <unwar src="${tomcat.dir}/webapps/solr.war" dest="${tomcat.dir}/webapps/solr"/> 
    190     <echo>Copying xalan related jar files from ${web.libdir} into ${tomcat.dir}/webapps/solr.war</echo> 
     206    <echo>Copying xalan related jar files, morphology and gs3-solrserver jars from ${web.libdir} into ${tomcat.dir}/webapps/solr.war</echo> 
    191207    <copy todir="${tomcat.dir}/webapps/solr/WEB-INF/lib"> 
    192208      <filelist dir="${web.libdir}" files="${shared-xalan-jars}" /> 
     209      <file file="${build.home}/gs3-solrserver.jar" /> 
    193210      <filelist dir="lib/russianmorphology" files="${russian-morph-jars}" /> 
    194211    </copy> 
     
    224241 
    225242 
    226   <target name="add-service" depends="copy-solr-web,copy-files,solr-for-tomcat,compile" description="Run this target to setup the Solr extension for Greenstone3" /> 
     243  <target name="add-service" depends="copy-solr-web,copy-files,compile,solr-for-tomcat" description="Run this target to setup the Solr extension for Greenstone3" /> 
    227244   
    228245  <target name="del-service" depends="del-files,del-solr-for-tomcat" 
     
    284301  </target> 
    285302   
     303  <target name="compile-gs3-solrserver" description="TEST Target to compile the solr server side gs3-solserver.jar for testing"> 
     304    <delete dir="build/org/greenstone/solrserver" /> 
     305    <delete file="build/gs3-solrserver.jar" /> 
     306    <antcall target="compile" /> 
     307    <copy file="${build.home}/gs3-solrserver.jar" todir="${tomcat.dir}/webapps/solr/WEB-INF/lib" /> 
     308  </target> 
     309 
    286310</project> 
  • gs3-extensions/solr/trunk/src/conf/solrconfig.xml

    r29135 r29986  
    824824       queries across multiple shards 
    825825    --> 
    826   <requestHandler name="/select" class="solr.SearchHandler"> 
     826  <!--<requestHandler name="/select" class="solr.SearchHandler">--> 
     827    <requestHandler name="/select" class="org.greenstone.solrserver.Greenstone3SearchHandler"> 
    827828    <!-- default values for query parameters can be specified, these 
    828829         will be overridden by parameters in the request 
     
    888889 
    889890  <!-- A request handler that returns indented JSON by default --> 
    890   <requestHandler name="/query" class="solr.SearchHandler"> 
     891  <!--<requestHandler name="/query" class="solr.SearchHandler">--> 
     892  <requestHandler name="/query" class="org.greenstone.solrserver.Greenstone3SearchHandler"> 
    891893     <lst name="defaults"> 
    892894       <str name="echoParams">explicit</str> 
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java

    r29711 r29986  
    3636import java.util.HashSet; 
    3737 
     38import java.util.regex.Pattern; 
     39import java.util.regex.Matcher; 
     40 
    3841import org.apache.log4j.Logger; 
    3942import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams 
     
    149152 
    150153 
    151     /** Extracts the query terms from the query string. The query string can be a boolean  
     154    /** 
     155     * UNUSED. 
     156     * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query. 
     157     * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead 
     158     * of the Embedded kind.  
     159     * 
     160     * The functionality of getTerms has been moved to  
     161     * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside  
     162     * tomcat's solr webapp). 
     163     * 
     164     * Extracts the query terms from the query string. The query string can be a boolean  
    152165     * combination of the various search fields with their search terms or phrases 
    153166     */ 
     
    371384        } 
    372385 
     386        // the solrserver will now 
    373387        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term 
    374         Term[] terms = getTerms(solrQuery, query_string); 
    375         if(terms != null) { 
    376             for(int i = 0; i < terms.length; i++) { 
    377             Term term = terms[i]; 
    378             String field = term.field(); 
    379             String queryTerm = term.text(); 
    380             // totaltermfreq(TI, 'farming') termfreq(TI, 'farming') 
    381              
    382             solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')"); 
    383             solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')"); 
    384             } 
    385         } 
    386388 
    387389        // do the query 
     
    433435                    // solr returns each term's totaltermfreq, ttf, at the document level, even though  
    434436                    // the ttf is the same for each document. So extract this information just for the first document 
    435                     if(i == 0) { // first document 
    436                          
    437                         if(terms != null) { 
    438                         for(int j = 0; j < terms.length; j++) { 
    439                             Term term = terms[j]; 
    440                             String field = term.field(); 
    441                             String queryTerm = term.text(); 
    442  
    443                             // totaltermfreq(TI, 'farming') termfreq(TI, 'farming') 
    444                             Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')"); 
    445                             Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')"); 
    446  
    447                             //System.err.println("**** ttf = " + totaltermfreq);  
    448                             //System.err.println("**** tf = " + termfreq); 
    449                             //logger.info("**** ttf = " + totaltermfreq);  
    450                             //logger.info("**** tf = " + termfreq); 
     437                    if(i == 0) { // first document, all others repeat the same termfreq data 
     438                        boolean foundTermInfo = false; 
     439 
     440                        Collection<String> fieldNames = doc.getFieldNames(); 
     441                        for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) { 
     442                        String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically') 
     443                        //logger.info("@@@@ found fieldName " + fieldName); 
    451444                         
    452                             solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int 
    453                         } 
    454                         } else { // no terms extracted from query_string 
     445 
     446                        if(fieldName.startsWith("totaltermfreq")) { 
     447                           //|| fieldName.startsWith("termfreq")) { 
     448                             
     449                            foundTermInfo = true; 
     450 
     451                            // e.g. totaltermfreq(TI,'farming')  
     452                            // e.g. termfreq(TI,'farming') 
     453                            Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)"); 
     454                            Matcher matcher = pattern.matcher(fieldName); 
     455                            String metaField, indexField, queryTerm; 
     456                            while (matcher.find()) { 
     457                            metaField = matcher.group(1); // termfreq or totaltermfreq 
     458                            indexField = matcher.group(2); //ZZ, TI 
     459                            queryTerm = matcher.group(3); 
     460 
     461                            //logger.info("\t@@@@ found field " + indexField); 
     462                            //logger.info("\t@@@@ queryTerm " + queryTerm); 
     463 
     464                            // Finally, can ask for the totaltermfreq value for this 
     465                            // searchterm in its indexed field: 
     466                            // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming') 
     467                            Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')"); 
     468                             
     469                            Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')"); 
     470                             
     471                            //System.err.println("**** ttf = " + totaltermfreq);  
     472                            //System.err.println("**** tf = " + termfreq); 
     473                            //logger.info("**** ttf = " + totaltermfreq);  
     474                            //logger.info("**** tf = " + termfreq); 
     475                            solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int 
     476                            } 
     477                        }                        
     478                        } 
     479                        if(!foundTermInfo) { // no terms extracted from query_string 
    455480                        solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms 
    456481                        }