Changeset 29170


Ignore:
Timestamp:
2014-08-06T19:27:07+12:00 (10 years ago)
Author:
ak19
Message:

TotalTermFrequency is now calculated for each search term in the query string

Location:
gs3-extensions/solr/trunk/src
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm

    r29142 r29170  
    299299            $schema_insert_xml .= "type=\"text_en_splitting\" ";
    300300        }
     301        # set termVectors=\"true\" when term vectors info is required,
     302        # see TermsResponse termResponse = solrResponse.getTermsResponse();
     303        #$schema_insert_xml .=  "indexed=\"true\" stored=\"false\" termVectors=\"true\" multiValued=\"true\" />\n";
    301304        $schema_insert_xml .=  "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
    302305                #$schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n";
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/service/GS2SolrSearch.java

    r29142 r29170  
    251251            String value = (String) m.getValue();
    252252
     253            ///System.err.println("### GS2SolrSearch.java: name " + name + " - value " + value);
     254
    253255            if (name.equals(MAXDOCS_PARAM) && !value.equals(""))
    254256            {
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java

    r29142 r29170  
    2929import java.net.URLDecoder;
    3030import java.util.ArrayList;
     31import java.util.Collection;
    3132import java.util.HashMap;
     33import java.util.Iterator;
    3234import java.util.List;
     35import java.util.Set;
     36import java.util.HashSet;
    3337
    3438import org.apache.log4j.Logger;
     39import org.apache.solr.client.solrj.SolrQuery; // subclass of ModifiableSolrParams
    3540import org.apache.solr.client.solrj.SolrServer;
    3641import org.apache.solr.client.solrj.SolrServerException;
     42import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
    3743import org.apache.solr.client.solrj.response.QueryResponse;
     44import org.apache.solr.client.solrj.response.TermsResponse;
     45
     46import org.apache.solr.core.CoreContainer;
     47import org.apache.solr.core.SolrCore;
     48
    3849import org.apache.solr.common.SolrDocument;
    3950import org.apache.solr.common.SolrDocumentList;
     
    4152import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
    4253import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
     54
     55import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
     56import org.apache.lucene.index.IndexReader;
     57import org.apache.lucene.index.Term;
     58import org.apache.solr.search.QParser;
     59import org.apache.solr.search.SolrIndexSearcher;
     60import org.apache.solr.request.LocalSolrQueryRequest;
    4361
    4462import com.google.gson.Gson;
     
    122140        return true;
    123141    }
     142
     143
     144    /** Extracts the query terms from the query string. The query string can be a boolean
     145     * combination of the various search fields with their search terms or phrases
     146     */
     147    public Term[] getTerms(SolrQuery solrQuery, String query_string)
     148    {
     149    Term terms[] = null;
     150   
     151    if(solr_core instanceof EmbeddedSolrServer) {
     152        EmbeddedSolrServer solrServer = (EmbeddedSolrServer)solr_core;
     153       
     154        CoreContainer coreContainer = solrServer.getCoreContainer();
     155       
     156        Collection<SolrCore> solrCores = coreContainer.getCores();
     157        if(!solrCores.isEmpty()) {
     158        Iterator<SolrCore> coreIterator = solrCores.iterator();
     159
     160        // Just use the first core, since the term frequency of any term is the same regardless of core
     161        if(coreIterator.hasNext()) {
     162            SolrCore solrCore = coreIterator.next();
     163           
     164           
     165            LocalSolrQueryRequest solrQueryRequest = new LocalSolrQueryRequest(solrCore, solrQuery);
     166            Query parsedQuery = null;
     167
     168            try {
     169           
     170            // get the qparser, default is LuceneQParserPlugin, which is called "lucene" see http://wiki.apache.org/solr/QueryParser
     171            QParser qParser = QParser.getParser(query_string, "lucene", solrQueryRequest);
     172            parsedQuery = qParser.getQuery();
     173
     174            // For PrefixQuery or WildCardQuery (a subclass of AutomatonQuery, incl RegexpQ),
     175            // like ZZ:econom* and ZZ:*date/regex queries, Query.extractTerms() throws an Exception
     176            // because it has not done the Query.rewrite() step yet. So do that manually for them.
     177            // This still doesn't provide us with the terms that econom* or *date break down into.
     178
     179            //if(parsedQuery instanceof PrefixQuery || parsedQuery instanceof AutomatonQuery) {
     180                            // Should we just check superclass MultiTermQuery?
     181            // Can be a BooleanQuery containing PrefixQuery/WildCardQuery among its clauses, so
     182            // just test for * in the query_string to determine if we need to do a rewrite() or not
     183            if(query_string.contains("*")) {
     184                SolrIndexSearcher searcher = solrQueryRequest.getSearcher();
     185                IndexReader indexReader = searcher.getIndexReader(); // returns a DirectoryReader
     186                parsedQuery = parsedQuery.rewrite(indexReader); // gets rewritten to ConstantScoreQuery
     187            }
     188
     189            //System.err.println("#### Query type was: " + parsedQuery.getClass());
     190            //logger.error("#### Query type was: " + parsedQuery.getClass());
     191           
     192            // extract the terms
     193            Set<Term> extractedQueryTerms = new HashSet<Term>();
     194            parsedQuery.extractTerms(extractedQueryTerms);
     195
     196            terms = new Term[extractedQueryTerms.size()];
     197           
     198            Iterator<Term> termsIterator = extractedQueryTerms.iterator();
     199            for(int i = 0; termsIterator.hasNext(); i++) {
     200                Term term = termsIterator.next();
     201                ///System.err.println("#### Found query term: " + term);
     202                ///logger.error("#### Found query term: " + term);
     203
     204                terms[i] = term; //(term.field(), term.text());
     205            }
     206           
     207            } catch(Exception queryParseException) {
     208            queryParseException.printStackTrace();
     209            System.err.println("Exception when parsing query: " + queryParseException.getMessage());
     210            System.err.println("#### Query type was: " + parsedQuery.getClass());
     211            logger.error("#### Query type was: " + parsedQuery.getClass());
     212            }
     213        }
     214       
     215        } else {
     216        System.err.println("#### Not an EmbeddedSolrServer. This shouldn't happen.");
     217        logger.error("#### Not an EmbeddedSolrServer. This shouldn't happen.");
     218        }
     219    }
     220   
     221    return terms;
     222    }
    124223
    125224    public SharedSoleneQueryResult runQuery(String query_string)
     
    205304        }
    206305
    207         ModifiableSolrParams solrParams = new ModifiableSolrParams();
    208         solrParams.set("q", query_string);
    209         // sort param, like "score desc" or "byORG asc"
    210         solrParams.set("sort", this.sort_field+" "+this.sort_order);
    211         // which result to start from
    212         solrParams.set("start", start_results);
    213         // how many results per "page"
    214         solrParams.set("rows", (end_results - start_results) + 1);
    215         // which fields to return for each document
    216         solrParams.set("fl", "docOID score");
    217         // turn on the termsComponent
    218         solrParams.set("terms", true);
    219         // which field to get the terms from
    220         solrParams.set("terms.fl", "ZZ");
     306
     307        SolrQuery solrQuery = new SolrQuery(query_string);
     308        solrQuery.addSort(this.sort_field, SolrQuery.ORDER.valueOf(this.sort_order)); // sort param, like "score desc" or "byORG asc"
     309        solrQuery.setStart(start_results); // which result to start from
     310        solrQuery.setRows((end_results - start_results) + 1); // how many results per "page"
     311
     312        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
     313        // WORKS (search didx core):
     314        //TI:farming
     315        //docOID,score,termfreq(TI,'farming'),totaltermfreq(TI,'farming')
     316
     317
     318        // which fields to return for each document, we'll add the request for totaltermfreq later
     319        // fl=docOID score termfreq(TI,'farming') totaltermfreq(TI,'farming')
     320        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");
     321       
     322        //solrQuery.setTerms(true); // turn on the termsComponent       
     323        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method
     324       
     325        // http://wiki.apache.org/solr/TermVectorComponent and https://cwiki.apache.org/confluence/display/solr/The+Term+Vector+Component
     326        // http://lucene.472066.n3.nabble.com/get-term-frequency-just-only-keywords-search-td4084510.html
     327        // http://stackoverflow.com/questions/13031534/word-frequency-in-solr
     328        // http://wiki.apache.org/solr/FunctionQuery#tf and #termfreq and #totaltermfreq
     329        // https://wiki.apache.org/solr/TermsComponent
     330
     331        //solrParams.set("tv.tf", true);// turn on the terms vector Component
     332        //solrParams.set("tv.fl", "ZZ");// which field to get the terms from /// ZZ
     333
    221334
    222335        if (_facets.size() > 0)
    223336        {
    224337          // enable facet counts in the query response
    225             solrParams.set("facet", "true");
     338            solrQuery.setFacet(true); //solrParams.set("facet", "true");
    226339            for (int i = 0; i < _facets.size(); i++)
    227340            {
    228341              // add this field as a facet
    229                 solrParams.add("facet.field", _facets.get(i));
    230             }
    231         }
    232 
     342              solrQuery.addFacetField(_facets.get(i)); // solrParams.add("facet.field", _facets.get(i));
     343            }
     344        }
     345
     346        // get the individual terms that make up the query, then request solr to return the totaltermfreq for each term
     347        Term[] terms = getTerms(solrQuery, query_string);
     348        if(terms != null) {
     349            for(int i = 0; i < terms.length; i++) {
     350            Term term = terms[i];
     351            String field = term.field();
     352            String queryTerm = term.text();
     353            // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
     354           
     355            solrQuery.addField("totaltermfreq(" + field + ",'" + queryTerm + "')");
     356            solrQuery.addField("termfreq(" + field + ",'" + queryTerm + "')");
     357            }
     358        }
     359
     360        // do the query
    233361        try
    234362        {
    235             QueryResponse solrResponse = solr_core.query(solrParams);
     363            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
    236364            SolrDocumentList hits = solrResponse.getResults();
     365            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml
    237366
    238367            if (hits != null)
     
    253382                solr_query_result.setEndResults(start_results + hits.size());
    254383
     384               
     385                // get the first field we're searching in, this will be the fallback field
    255386                int sepIndex = query_string.indexOf(":");
    256                 String field = query_string.substring(0, sepIndex);
    257                 String query = query_string.substring(sepIndex + 2, query_string.length() - 1);
    258 
    259                 solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);
     387                String defaultField = query_string.substring(0, sepIndex);
     388                //String query = query_string.substring(sepIndex + 2, query_string.length() - 1); // Replaced by call to getTerms()
     389
     390                //solr_query_result.addTerm(query, field, (int) hits.getNumFound(), -1);
    260391
    261392                // Output the matching documents
     
    263394                {
    264395                    SolrDocument doc = hits.get(i);
    265 
     396                   
    266397                    // Need to think about how to support document term frequency.  Make zero for now
    267398                    int doc_term_freq = 0;
     
    271402                    logger.info("**** docOID = " + docOID);
    272403                    logger.info("**** score = " + score);
    273 
    274                     solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq);
     404                                       
     405                   
     406                    // solr returns each term's totaltermfreq, ttf, at the document level, even though
     407                    // the ttf is the same for each document. So extract this information just for the first document
     408                    if(i == 0) { // first document
     409                       
     410                        if(terms != null) {
     411                        for(int j = 0; j < terms.length; j++) {
     412                            Term term = terms[j];
     413                            String field = term.field();
     414                            String queryTerm = term.text();
     415
     416                            // totaltermfreq(TI, 'farming') termfreq(TI, 'farming')
     417                            Long totaltermfreq = (Long)doc.get("totaltermfreq("+field+",'"+queryTerm+"')");
     418                            Integer termfreq = (Integer)doc.get("termfreq("+field+",'"+queryTerm+"')");
     419
     420                            //System.err.println("**** ttf = " + totaltermfreq);
     421                            //System.err.println("**** tf = " + termfreq);
     422                            //logger.info("**** ttf = " + totaltermfreq);
     423                            //logger.info("**** tf = " + termfreq);
     424                       
     425                            solr_query_result.addTerm(queryTerm, field, (int) hits.getNumFound(), totaltermfreq.intValue()); // long totaltermfreq to int
     426                        }
     427                        } else { // no terms extracted from query_string
     428                        solr_query_result.addTerm(query_string, defaultField, (int) hits.getNumFound(), -1); // no terms
     429                        }
     430                    }
     431
     432                    solr_query_result.addDoc(docOID, score.floatValue(), doc_term_freq); // doc_termfreq for which term????
    275433                }
    276434            }
     
    309467        super.cleanUp();
    310468    }
     469
    311470}
Note: See TracChangeset for help on using the changeset viewer.