Changeset 30050


Ignore:
Timestamp:
2015-07-21T05:35:34+12:00 (9 years ago)
Author:
Georgiy Litvinov
Message:

Solr repo modifications for Solr-side highlighting and snippets

Location:
gs3-extensions/solr/trunk/src
Files:
6 edited

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/conf/schema.xml.in

    r29932 r30050  
    478478    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
    479479      <analyzer type="index">
     480    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    480481        <tokenizer class="solr.StandardTokenizerFactory"/>
    481482        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" />
     
    500501    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
    501502      <analyzer type="index">
     503    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    502504        <tokenizer class="solr.StandardTokenizerFactory"/>
    503505        <!-- in this example, we will only use synonyms at query time
     
    548550    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
    549551      <analyzer type="index">
     552    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    550553        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    551554        <!-- in this example, we will only use synonyms at query time
     
    583586    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
    584587      <analyzer>
     588    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    585589        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    586590        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
     
    762766    <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100">
    763767      <analyzer>
     768    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    764769        <tokenizer class="solr.StandardTokenizerFactory"/>
    765770        <!-- for any non-arabic -->
     
    775780    <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100">
    776781      <analyzer>
     782    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    777783        <tokenizer class="solr.StandardTokenizerFactory"/>
    778784        <filter class="solr.LowerCaseFilterFactory"/>
     
    785791    <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100">
    786792      <analyzer>
     793    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    787794        <tokenizer class="solr.StandardTokenizerFactory"/>
    788795        <!-- removes l', etc -->
     
    797804    <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100">
    798805      <analyzer>
     806    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    799807        <tokenizer class="solr.StandardTokenizerFactory"/>
    800808        <!-- normalize width before bigram, as e.g. half-width dakuten combine  -->
     
    809817    <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100">
    810818      <analyzer>
     819    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    811820        <tokenizer class="solr.StandardTokenizerFactory"/>
    812821        <filter class="solr.SoraniNormalizationFilterFactory"/>
     
    821830    <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100">
    822831      <analyzer>
     832    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    823833        <tokenizer class="solr.StandardTokenizerFactory"/>
    824834        <filter class="solr.LowerCaseFilterFactory"/>
     
    831841    <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100">
    832842      <analyzer>
     843    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    833844        <tokenizer class="solr.StandardTokenizerFactory"/>
    834845        <filter class="solr.LowerCaseFilterFactory"/>
     
    841852    <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
    842853      <analyzer>
     854    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    843855        <tokenizer class="solr.StandardTokenizerFactory"/>
    844856        <filter class="solr.LowerCaseFilterFactory"/>
     
    854866    <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100">
    855867      <analyzer>
     868    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    856869        <tokenizer class="solr.StandardTokenizerFactory"/>
    857870        <!-- greek specific lowercase for sigma -->
     
    865878    <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100">
    866879      <analyzer>
     880    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    867881        <tokenizer class="solr.StandardTokenizerFactory"/>
    868882        <filter class="solr.LowerCaseFilterFactory"/>
     
    876890    <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100">
    877891      <analyzer>
     892    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    878893        <tokenizer class="solr.StandardTokenizerFactory"/>
    879894        <filter class="solr.LowerCaseFilterFactory"/>
     
    887902      <analyzer>
    888903        <!-- for ZWNJ -->
     904    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    889905        <charFilter class="solr.PersianCharFilterFactory"/>
    890906        <tokenizer class="solr.StandardTokenizerFactory"/>
     
    899915    <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100">
    900916      <analyzer>
     917    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    901918        <tokenizer class="solr.StandardTokenizerFactory"/>
    902919        <filter class="solr.LowerCaseFilterFactory"/>
     
    910927    <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100">
    911928      <analyzer>
     929    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    912930        <tokenizer class="solr.StandardTokenizerFactory"/>
    913931        <!-- removes l', etc -->
     
    924942    <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100">
    925943      <analyzer>
     944    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    926945        <tokenizer class="solr.StandardTokenizerFactory"/>
    927946        <!-- removes d', etc -->
     
    938957    <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100">
    939958      <analyzer>
     959    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    940960        <tokenizer class="solr.StandardTokenizerFactory"/>
    941961        <filter class="solr.LowerCaseFilterFactory"/>
     
    949969    <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100">
    950970      <analyzer>
     971    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    951972        <tokenizer class="solr.StandardTokenizerFactory"/>
    952973        <filter class="solr.LowerCaseFilterFactory"/>
     
    963984    <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100">
    964985      <analyzer>
     986    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    965987        <tokenizer class="solr.StandardTokenizerFactory"/>
    966988        <filter class="solr.LowerCaseFilterFactory"/>
     
    974996    <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100">
    975997      <analyzer>
     998    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    976999        <tokenizer class="solr.StandardTokenizerFactory"/>
    9771000        <filter class="solr.LowerCaseFilterFactory"/>
     
    9841007    <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100">
    9851008      <analyzer>
     1009    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    9861010        <tokenizer class="solr.StandardTokenizerFactory"/>
    9871011        <filter class="solr.LowerCaseFilterFactory"/>
     
    9951019    <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100">
    9961020      <analyzer>
     1021    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    9971022        <tokenizer class="solr.StandardTokenizerFactory"/>
    9981023        <!-- removes l', etc -->
     
    10411066           See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support.
    10421067        -->
     1068    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    10431069        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/>
    10441070        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>-->
     
    10611087    <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100">
    10621088      <analyzer>
     1089    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    10631090        <tokenizer class="solr.StandardTokenizerFactory"/>
    10641091        <filter class="solr.LowerCaseFilterFactory"/>
     
    10711098    <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100">
    10721099      <analyzer>
     1100    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    10731101        <tokenizer class="solr.StandardTokenizerFactory"/>
    10741102        <filter class="solr.LowerCaseFilterFactory"/>
     
    10821110    <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100">
    10831111      <analyzer>
     1112    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    10841113        <tokenizer class="solr.StandardTokenizerFactory"/>
    10851114        <filter class="solr.LowerCaseFilterFactory"/>
     
    10951124    <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100">
    10961125      <analyzer>
     1126    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    10971127        <tokenizer class="solr.StandardTokenizerFactory"/>
    10981128        <filter class="solr.LowerCaseFilterFactory"/>
     
    11081138    <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100">
    11091139      <analyzer>
     1140    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    11101141        <tokenizer class="solr.StandardTokenizerFactory"/>
    11111142        <filter class="solr.LowerCaseFilterFactory"/>
     
    11181149    <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100">
    11191150      <analyzer>
     1151    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    11201152        <tokenizer class="solr.StandardTokenizerFactory"/>
    11211153        <filter class="solr.LowerCaseFilterFactory"/>
     
    11271159    <!-- Russian with morphology-->
    11281160    <fieldType name="text_ru_morph" class="solr.TextField" positionIncrementGap="100">
    1129           <analyzer>
    1130           <tokenizer class="solr.StandardTokenizerFactory"/>
    1131           <filter class="solr.LowerCaseFilterFactory"/>
    1132           <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
    1133           <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/>
    1134           </analyzer>
     1161      <analyzer>
     1162    <charFilter class="solr.HTMLStripCharFilterFactory"/>
     1163        <tokenizer class="solr.StandardTokenizerFactory"/>
     1164        <filter class="solr.LowerCaseFilterFactory"/>
     1165        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" />
     1166        <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/>
     1167      </analyzer>
    11351168    </fieldType>
    11361169 
     
    11381171    <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100">
    11391172      <analyzer>
     1173    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    11401174        <tokenizer class="solr.StandardTokenizerFactory"/>
    11411175        <filter class="solr.LowerCaseFilterFactory"/>
     
    11491183    <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100">
    11501184      <analyzer>
     1185    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    11511186        <tokenizer class="solr.StandardTokenizerFactory"/>
    11521187        <filter class="solr.LowerCaseFilterFactory"/>
     
    11591194    <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100">
    11601195      <analyzer>
     1196    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    11611197        <tokenizer class="solr.StandardTokenizerFactory"/>
    11621198        <filter class="solr.TurkishLowerCaseFilterFactory"/>
  • gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm

    r29711 r30050  
    310310        # see TermsResponse termResponse = solrResponse.getTermsResponse();
    311311        #$schema_insert_xml .=  "indexed=\"true\" stored=\"false\" termVectors=\"true\" multiValued=\"true\" />\n";
    312         $schema_insert_xml .=  "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n";
     312        $schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n";
    313313                #$schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n";
    314314    }
  • gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm

    r29945 r30050  
    430430            if ($self->{'indexing_text'}) {
    431431                # we always strip html
    432                 $section_text = $self->preprocess_text($section_text, 1, "");
     432                &ghtml::htmlsafe($section_text);
     433                #$section_text = $self->preprocess_text($section_text, 1, "");
    433434            }
    434435            else {
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/service/GS2SolrSearch.java

    r29711 r30050  
    6565                paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_DESCENDING);
    6666        does_faceting = true;
     67        does_highlight_snippets = true;
     68        does_full_field_highlighting = true;
    6769        // Used to store the solr cores that match the required 'level'
    6870        // of search (e.g. either document-level=>didx, or
     
    341343        try
    342344        {
     345            //if it is a Highlighting Query - execute it
     346            this.solr_src.setHighlightField(indexField);
     347            if(hldocOID != null)
     348            {
     349                String rslt = this.solr_src.runHighlightingQuery(query,hldocOID);
     350                return rslt;
     351            }
    343352            SharedSoleneQueryResult sqr = this.solr_src.runQuery(query);
    344353
     
    352361        return null;
    353362    }
    354 
     363   
     364   
    355365    /** get the total number of docs that match */
    356366    protected long numDocsMatched(Object query_result)
     
    445455
    446456        return newFacetList;
     457    }
     458    @Override
     459    protected Map<String, Map<String, List<String>>> getHighlightSnippets(Object query_result)
     460    {
     461        if (!(query_result instanceof SolrQueryResult))
     462        {
     463            return null;
     464        }
     465
     466        SolrQueryResult result = (SolrQueryResult) query_result;
     467       
     468        return result.getHighlightResults();
    447469    }
    448470
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryResult.java

    r29142 r30050  
    2828
    2929import java.util.List;
     30import java.util.Map;
    3031
    3132import org.apache.solr.client.solrj.response.FacetField;
     
    4142{
    4243    protected List<FacetField> _facetResults = null;
     44    protected Map<String,Map<String,List<String>>> _highlightResults = null;
    4345    SolrQueryResult()
    4446    {
    4547        super();
    4648    }
    47    
    4849    public void setFacetResults(List<FacetField> facetResults)
    4950    {
     
    5556        return _facetResults;
    5657    }
     58    //Save highlighting snippets
     59    public void setHighlightResults(Map<String,Map<String,List<String>>> highlightResults){
     60        _highlightResults = highlightResults;
     61    }
     62    //Extract highlighting snippets
     63    public Map<String,Map<String,List<String>>> getHighlightResults(){
     64        return _highlightResults;
     65    }
     66   
    5767}
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java

    r29987 r30050  
    3333import java.util.Iterator;
    3434import java.util.List;
     35import java.util.Map;
    3536import java.util.Set;
    3637import java.util.HashSet;
    37 
    3838import java.util.regex.Pattern;
    3939import java.util.regex.Matcher;
     
    4646import org.apache.solr.client.solrj.response.QueryResponse;
    4747import org.apache.solr.client.solrj.response.TermsResponse;
    48 
    4948import org.apache.solr.core.CoreContainer;
    5049import org.apache.solr.core.SolrCore;
    51 
    5250import org.apache.solr.common.SolrDocument;
    5351import org.apache.solr.common.SolrDocumentList;
     
    5553import org.greenstone.LuceneWrapper4.SharedSoleneQuery;
    5654import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult;
    57 
    5855import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more
    5956import org.apache.lucene.index.IndexReader;
     
    8178    SolrServer solr_core = null;
    8279
     80    protected String highlight_field = null;
     81   
    8382    String collection_core_name_prefix = null;
    8483
     
    109108    }
    110109  }
    111 
     110  public void setHighlightField(String hl_field)
     111  {
     112    this.highlight_field = hl_field;
     113  }
    112114  public void setSortOrder(String order)
    113115  {
     
    360362        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");
    361363       
     364        //Turn on highlighting
     365        solrQuery.setHighlight(true);
     366        //Return 3 snippets for each document
     367        solrQuery.setParam("hl.snippets", "3");
     368        solrQuery.setParam("hl.fl", highlight_field);
     369        solrQuery.setHighlightSimplePre("&lt;span class=\"snippetText\"&gt;");
     370       
     371        //Set text which appears after highlighted term
     372        solrQuery.setHighlightSimplePost("&lt;/span&gt;");
     373       
    362374        //solrQuery.setTerms(true); // turn on the termsComponent       
    363375        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method
     
    392404            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
    393405            SolrDocumentList hits = solrResponse.getResults();
     406            Map<String, Map<String, List<String>>> hlResponse = solrResponse.getHighlighting();
     407            solr_query_result.setHighlightResults(hlResponse);
    394408            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml
    395409
     
    410424                solr_query_result.setStartResults(start_results);
    411425                solr_query_result.setEndResults(start_results + hits.size());
    412 
    413                
     426                   
    414427                // get the first field we're searching in, this will be the fallback field
    415428                int sepIndex = query_string.indexOf(":");
     
    505518        return solr_query_result;
    506519    }
     520// Highlighting query. Returns full highlighted text for document
     521    public String runHighlightingQuery(String query,String hldocOID)
     522    {
     523                   
     524        SolrQueryResult solr_query_result = new SolrQueryResult();
     525        solr_query_result.clear();
     526
     527       
     528        /* Create Query*/
     529       
     530        SolrQuery solrQuery = new SolrQuery(query);
     531       
     532        /* Set Query Parameters*/
     533       
     534        //Turn on highlighting
     535        solrQuery.setHighlight(true);
     536        //Extract default field from query
     537       
     538        //Set field for highlighting
     539        solrQuery.setParam("hl.fl", highlight_field);
     540       
     541        //Get whole highlighted field
     542        solrQuery.setHighlightFragsize(0);
     543       
     544        //Return only required document by docOID
     545        solrQuery.setFilterQueries("docOID:"+ hldocOID);
     546       
     547        //Set text which appears before highlighted term
     548        //solrQuery.setHighlightSimplePre("<annotation type=\"query_term\">");
     549        solrQuery.setHighlightSimplePre("<span class=\"termHighlight\">");
     550        //Set text which appears after highlighted term
     551        //solrQuery.setHighlightSimplePost("</annotation>");
     552        solrQuery.setHighlightSimplePost("</span>");
     553        //Prepare results
     554        String text = null;
     555        // do the query
     556        try
     557        {
     558            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams);
     559            //Get highliting results
     560            Map<String,Map<String,List<String>>> highlightingResults = solrResponse.getHighlighting();
     561            //Get highlited document text
     562            text = highlightingResults.get(hldocOID).get(highlight_field).get(0);
     563           
     564                                               
     565        }
     566        catch (SolrServerException server_exception)
     567        {
     568            server_exception.printStackTrace();
     569           
     570        }
     571        return text;
     572    }
    507573
    508574    //Greenstone universe operates with a base of 1 for "start_results"
Note: See TracChangeset for help on using the changeset viewer.