Ignore:
Timestamp:
2015-06-19T23:21:26+12:00 (9 years ago)
Author:
ak19
Message:

The getTerms() functionality previously used by the EmbeddedSolrServer has now been re-implemented for HttpSolrServer with the new custom Greenstone Solr RequestHandler class Greenstone3SearchHandler, which lives on the solr server side, in tomcat's solr webapp. The functionality has also been improved: it can now handle searches such as "econom* cat", by recursively calling setRewriteMethods on any PrefixQuery and WildcardQuery MultiQueries within an overall BooleanQuery, and it handles BooleanQuery.TooManyClauses exceptions when the number of expanded terms is too large, such as for a search of "a*".

Location:
gs3-extensions/solr/trunk/src/src/java/org/greenstone
Files:
2 added
1 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java

    r29711 r29986  
    3636import java.util.HashSet;
    3737
     38import java.util.regex.Pattern;
     39import java.util.regex.Matcher;
     40
    3841import org.apache.log4j.Logger;
    3942import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
     
    149152
    150153
    151     /** Extracts the query terms from the query string. The query string can be a boolean
     154    /**
     155     * UNUSED.
     156     * Back when we used the EmbeddedSolrServer, this getTerms method would expand the terms of a query.
     157     * Because of Solr/Lucene Index locking exceptions, we switched over to the HttpSolrServer instead
     158     * of the Embedded kind.
     159     *
     160     * The functionality of getTerms has been moved to
     161     * ../solrserver/Greenstone3SearchHandler.java, which will sit on the solrserver side (inside
     162     * tomcat's solr webapp).
     163     *
     164     * Extracts the query terms from the query string. The query string can be a boolean
    152165     * combination of the various search fields with their search terms or phrases
    153166     */
     
    371384        }
    372385
     386        // the solrserver will now
    373387        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
    374         Term[] terms = getTerms(solrQuery, query_string);
    375         if(terms != null) {
    376             for(int i = 0; i < terms.length; i++) {
    377             Term term = terms[i];
    378             String field = term.field();
    379             String queryTerm = term.text();
    380             // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
    381            
    382             solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
    383             solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
    384             }
    385         }
    386388
    387389        // do the query
     
    433435                    // solr returns each term's totaltermfreq, ttf, at the document level, even though
    434436                    // the ttf is the same for each document. So extract this information just for the first document
    435                     if(i == 0) { // first document
    436                        
    437                         if(terms != null) {
    438                         for(int j = 0; j < terms.length; j++) {
    439                             Term term = terms[j];
    440                             String field = term.field();
    441                             String queryTerm = term.text();
    442 
    443                             // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
    444                             Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
    445                             Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");
    446 
    447                             //System.err.println("**** ttf = " + totaltermfreq);
    448                             //System.err.println("**** tf = " + termfreq);
    449                             //logger.info("**** ttf = " + totaltermfreq);
    450                             //logger.info("**** tf = " + termfreq);
     437                    if(i == 0) { // first document, all others repeat the same termfreq data
     438                        boolean foundTermInfo = false;
     439
     440                        Collection<String> fieldNames = doc.getFieldNames();
     441                        for(Iterator<String> it = fieldNames.iterator(); it.hasNext(); ) {
     442                        String fieldName = it.next(); // e.g. looking for totaltermfreq(ZZ,'economically')
     443                        //logger.info("@@@@ found fieldName " + fieldName);
    451444                       
    452                             solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
    453                         }
    454                         } else { // no terms extracted from query_string
     445
     446                        if(fieldName.startsWith("totaltermfreq")) {
     447                           //|| fieldName.startsWith("termfreq")) {
     448                           
     449                            foundTermInfo = true;
     450
     451                            // e.g. totaltermfreq(TI,'farming')
     452                            // e.g. termfreq(TI,'farming')
     453                            Pattern pattern = Pattern.compile("(.*?termfreq)\\((.*?),'(.*?)'\\)");
     454                            Matcher matcher = pattern.matcher(fieldName);
     455                            String metaField, indexField, queryTerm;
     456                            while (matcher.find()) {
     457                            metaField = matcher.group(1); // termfreq or totaltermfreq
     458                            indexField = matcher.group(2); //ZZ, TI
     459                            queryTerm = matcher.group(3);
     460
     461                            //logger.info("\t@@@@ found field " + indexField);
     462                            //logger.info("\t@@@@ queryTerm " + queryTerm);
     463
     464                            // Finally, can ask for the totaltermfreq value for this
     465                            // searchterm in its indexed field:
     466                            // e.g. totaltermfreq(TI,'farming'), e.g. termfreq(TI,'farming')
     467                            Long totaltermfreq = (Long)doc.get("totaltermfreq("+indexField+",'"+queryTerm+"')");
     468                           
     469                            Integer termfreq = (Integer)doc.get("termfreq("+indexField+",'"+queryTerm+"')");
     470                           
     471                            //System.err.println("**** ttf = " + totaltermfreq);
     472                            //System.err.println("**** tf = " + termfreq);
     473                            //logger.info("**** ttf = " + totaltermfreq);
     474                            //logger.info("**** tf = " + termfreq);
     475                            solr_query_result.addTerm(queryTerm, indexField, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
     476                            }
     477                        }                       
     478                        }
     479                        if(!foundTermInfo) { // no terms extracted from query_string
    455480                        solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
    456481                        }
Note: See TracChangeset for help on using the changeset viewer.