Changeset 30050

Show
Ignore:
Timestamp:
21.07.2015 05:35:34 (4 years ago)
Author:
litvinovg
Message:

Solr repo modifications for Solr-side highlighting and snippets

Location:
gs3-extensions/solr/trunk/src
Files:
6 modified

Legend:

Unmodified
Added
Removed
  • gs3-extensions/solr/trunk/src/conf/schema.xml.in

    r29932 r30050  
    478478    <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> 
    479479      <analyzer type="index"> 
     480    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    480481        <tokenizer class="solr.StandardTokenizerFactory"/> 
    481482        <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> 
     
    500501    <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> 
    501502      <analyzer type="index"> 
     503    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    502504        <tokenizer class="solr.StandardTokenizerFactory"/> 
    503505        <!-- in this example, we will only use synonyms at query time 
     
    548550    <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> 
    549551      <analyzer type="index"> 
     552    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    550553        <tokenizer class="solr.WhitespaceTokenizerFactory"/> 
    551554        <!-- in this example, we will only use synonyms at query time 
     
    583586    <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> 
    584587      <analyzer> 
     588    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    585589        <tokenizer class="solr.WhitespaceTokenizerFactory"/> 
    586590        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> 
     
    762766    <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100"> 
    763767      <analyzer>  
     768    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    764769        <tokenizer class="solr.StandardTokenizerFactory"/> 
    765770        <!-- for any non-arabic --> 
     
    775780    <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100"> 
    776781      <analyzer>  
     782    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    777783        <tokenizer class="solr.StandardTokenizerFactory"/>  
    778784        <filter class="solr.LowerCaseFilterFactory"/> 
     
    785791    <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100"> 
    786792      <analyzer>  
     793    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    787794        <tokenizer class="solr.StandardTokenizerFactory"/> 
    788795        <!-- removes l', etc --> 
     
    797804    <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> 
    798805      <analyzer> 
     806    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    799807        <tokenizer class="solr.StandardTokenizerFactory"/> 
    800808        <!-- normalize width before bigram, as e.g. half-width dakuten combine  --> 
     
    809817    <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100"> 
    810818      <analyzer> 
     819    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    811820        <tokenizer class="solr.StandardTokenizerFactory"/> 
    812821        <filter class="solr.SoraniNormalizationFilterFactory"/> 
     
    821830    <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> 
    822831      <analyzer>  
     832    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    823833        <tokenizer class="solr.StandardTokenizerFactory"/> 
    824834        <filter class="solr.LowerCaseFilterFactory"/> 
     
    831841    <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100"> 
    832842      <analyzer>  
     843    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    833844        <tokenizer class="solr.StandardTokenizerFactory"/> 
    834845        <filter class="solr.LowerCaseFilterFactory"/> 
     
    841852    <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100"> 
    842853      <analyzer>  
     854    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    843855        <tokenizer class="solr.StandardTokenizerFactory"/> 
    844856        <filter class="solr.LowerCaseFilterFactory"/> 
     
    854866    <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100"> 
    855867      <analyzer>  
     868    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    856869        <tokenizer class="solr.StandardTokenizerFactory"/> 
    857870        <!-- greek specific lowercase for sigma --> 
     
    865878    <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100"> 
    866879      <analyzer>  
     880    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    867881        <tokenizer class="solr.StandardTokenizerFactory"/> 
    868882        <filter class="solr.LowerCaseFilterFactory"/> 
     
    876890    <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100"> 
    877891      <analyzer>  
     892    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    878893        <tokenizer class="solr.StandardTokenizerFactory"/> 
    879894        <filter class="solr.LowerCaseFilterFactory"/> 
     
    887902      <analyzer> 
    888903        <!-- for ZWNJ --> 
     904    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    889905        <charFilter class="solr.PersianCharFilterFactory"/> 
    890906        <tokenizer class="solr.StandardTokenizerFactory"/> 
     
    899915    <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100"> 
    900916      <analyzer>  
     917    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    901918        <tokenizer class="solr.StandardTokenizerFactory"/> 
    902919        <filter class="solr.LowerCaseFilterFactory"/> 
     
    910927    <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100"> 
    911928      <analyzer>  
     929    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    912930        <tokenizer class="solr.StandardTokenizerFactory"/> 
    913931        <!-- removes l', etc --> 
     
    924942    <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"> 
    925943      <analyzer>  
     944    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    926945        <tokenizer class="solr.StandardTokenizerFactory"/> 
    927946        <!-- removes d', etc --> 
     
    938957    <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100"> 
    939958      <analyzer>  
     959    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    940960        <tokenizer class="solr.StandardTokenizerFactory"/> 
    941961        <filter class="solr.LowerCaseFilterFactory"/> 
     
    949969    <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100"> 
    950970      <analyzer>  
     971    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    951972        <tokenizer class="solr.StandardTokenizerFactory"/> 
    952973        <filter class="solr.LowerCaseFilterFactory"/> 
     
    963984    <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100"> 
    964985      <analyzer>  
     986    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    965987        <tokenizer class="solr.StandardTokenizerFactory"/> 
    966988        <filter class="solr.LowerCaseFilterFactory"/> 
     
    974996    <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100"> 
    975997      <analyzer>  
     998    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    976999        <tokenizer class="solr.StandardTokenizerFactory"/> 
    9771000        <filter class="solr.LowerCaseFilterFactory"/> 
     
    9841007    <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100"> 
    9851008      <analyzer>  
     1009    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    9861010        <tokenizer class="solr.StandardTokenizerFactory"/> 
    9871011        <filter class="solr.LowerCaseFilterFactory"/> 
     
    9951019    <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100"> 
    9961020      <analyzer>  
     1021    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    9971022        <tokenizer class="solr.StandardTokenizerFactory"/> 
    9981023        <!-- removes l', etc --> 
     
    10411066           See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. 
    10421067        --> 
     1068    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    10431069        <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> 
    10441070        <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>--> 
     
    10611087    <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100"> 
    10621088      <analyzer>  
     1089    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    10631090        <tokenizer class="solr.StandardTokenizerFactory"/> 
    10641091        <filter class="solr.LowerCaseFilterFactory"/> 
     
    10711098    <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100"> 
    10721099      <analyzer>  
     1100    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    10731101        <tokenizer class="solr.StandardTokenizerFactory"/> 
    10741102        <filter class="solr.LowerCaseFilterFactory"/> 
     
    10821110    <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100"> 
    10831111      <analyzer>  
     1112    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    10841113        <tokenizer class="solr.StandardTokenizerFactory"/> 
    10851114        <filter class="solr.LowerCaseFilterFactory"/> 
     
    10951124    <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> 
    10961125      <analyzer>  
     1126    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    10971127        <tokenizer class="solr.StandardTokenizerFactory"/> 
    10981128        <filter class="solr.LowerCaseFilterFactory"/> 
     
    11081138    <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100"> 
    11091139      <analyzer>  
     1140    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    11101141        <tokenizer class="solr.StandardTokenizerFactory"/> 
    11111142        <filter class="solr.LowerCaseFilterFactory"/> 
     
    11181149    <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100"> 
    11191150      <analyzer>  
     1151    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    11201152        <tokenizer class="solr.StandardTokenizerFactory"/> 
    11211153        <filter class="solr.LowerCaseFilterFactory"/> 
     
    11271159    <!-- Russian with morphology--> 
    11281160    <fieldType name="text_ru_morph" class="solr.TextField" positionIncrementGap="100"> 
    1129           <analyzer> 
    1130           <tokenizer class="solr.StandardTokenizerFactory"/> 
    1131           <filter class="solr.LowerCaseFilterFactory"/> 
    1132           <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" /> 
    1133           <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/> 
    1134           </analyzer> 
     1161      <analyzer> 
     1162    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
     1163        <tokenizer class="solr.StandardTokenizerFactory"/> 
     1164        <filter class="solr.LowerCaseFilterFactory"/> 
     1165        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" /> 
     1166        <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/> 
     1167      </analyzer> 
    11351168    </fieldType> 
    11361169  
     
    11381171    <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100"> 
    11391172      <analyzer>  
     1173    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    11401174        <tokenizer class="solr.StandardTokenizerFactory"/> 
    11411175        <filter class="solr.LowerCaseFilterFactory"/> 
     
    11491183    <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100"> 
    11501184      <analyzer>  
     1185    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    11511186        <tokenizer class="solr.StandardTokenizerFactory"/> 
    11521187        <filter class="solr.LowerCaseFilterFactory"/> 
     
    11591194    <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100"> 
    11601195      <analyzer>  
     1196    <charFilter class="solr.HTMLStripCharFilterFactory"/> 
    11611197        <tokenizer class="solr.StandardTokenizerFactory"/> 
    11621198        <filter class="solr.TurkishLowerCaseFilterFactory"/> 
  • gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm

    r29711 r30050  
    310310        # see TermsResponse termResponse = solrResponse.getTermsResponse();  
    311311        #$schema_insert_xml .=  "indexed=\"true\" stored=\"false\" termVectors=\"true\" multiValued=\"true\" />\n"; 
    312         $schema_insert_xml .=  "indexed=\"true\" stored=\"false\" multiValued=\"true\" />\n"; 
     312        $schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n"; 
    313313                #$schema_insert_xml .=  "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n"; 
    314314    } 
  • gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm

    r29945 r30050  
    430430            if ($self->{'indexing_text'}) { 
    431431                # we always strip html 
    432                 $section_text = $self->preprocess_text($section_text, 1, ""); 
     432                &ghtml::htmlsafe($section_text); 
     433                #$section_text = $self->preprocess_text($section_text, 1, ""); 
    433434            } 
    434435            else {  
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/service/GS2SolrSearch.java

    r29711 r30050  
    6565                paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_DESCENDING); 
    6666        does_faceting = true; 
     67        does_highlight_snippets = true; 
     68        does_full_field_highlighting = true; 
    6769        // Used to store the solr cores that match the required 'level'  
    6870        // of search (e.g. either document-level=>didx, or  
     
    341343        try 
    342344        { 
     345            //if it is a Highlighting Query - execute it 
     346            this.solr_src.setHighlightField(indexField); 
     347            if(hldocOID != null) 
     348            { 
     349                String rslt = this.solr_src.runHighlightingQuery(query,hldocOID); 
     350                return rslt; 
     351            } 
    343352            SharedSoleneQueryResult sqr = this.solr_src.runQuery(query); 
    344353 
     
    352361        return null; 
    353362    } 
    354  
     363     
     364     
    355365    /** get the total number of docs that match */ 
    356366    protected long numDocsMatched(Object query_result) 
     
    445455 
    446456        return newFacetList; 
     457    } 
     458    @Override 
     459    protected Map<String, Map<String, List<String>>> getHighlightSnippets(Object query_result) 
     460    { 
     461        if (!(query_result instanceof SolrQueryResult)) 
     462        { 
     463            return null; 
     464        } 
     465 
     466        SolrQueryResult result = (SolrQueryResult) query_result; 
     467         
     468        return result.getHighlightResults(); 
    447469    } 
    448470 
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryResult.java

    r29142 r30050  
    2828 
    2929import java.util.List; 
     30import java.util.Map; 
    3031 
    3132import org.apache.solr.client.solrj.response.FacetField; 
     
    4142{ 
    4243    protected List<FacetField> _facetResults = null; 
     44    protected Map<String,Map<String,List<String>>> _highlightResults = null; 
    4345    SolrQueryResult() 
    4446    { 
    4547        super(); 
    4648    } 
    47      
    4849    public void setFacetResults(List<FacetField> facetResults) 
    4950    { 
     
    5556        return _facetResults; 
    5657    } 
     58    //Save highlighting snippets 
     59    public void setHighlightResults(Map<String,Map<String,List<String>>> highlightResults){ 
     60        _highlightResults = highlightResults; 
     61    } 
     62    //Extract highlighting snippets 
     63    public Map<String,Map<String,List<String>>> getHighlightResults(){ 
     64        return _highlightResults; 
     65    } 
     66     
    5767} 
  • gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java

    r29987 r30050  
    3333import java.util.Iterator; 
    3434import java.util.List; 
     35import java.util.Map; 
    3536import java.util.Set; 
    3637import java.util.HashSet; 
    37  
    3838import java.util.regex.Pattern; 
    3939import java.util.regex.Matcher; 
     
    4646import org.apache.solr.client.solrj.response.QueryResponse; 
    4747import org.apache.solr.client.solrj.response.TermsResponse; 
    48  
    4948import org.apache.solr.core.CoreContainer; 
    5049import org.apache.solr.core.SolrCore; 
    51  
    5250import org.apache.solr.common.SolrDocument; 
    5351import org.apache.solr.common.SolrDocumentList; 
     
    5553import org.greenstone.LuceneWrapper4.SharedSoleneQuery; 
    5654import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult; 
    57  
    5855import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more 
    5956import org.apache.lucene.index.IndexReader; 
     
    8178    SolrServer solr_core = null; 
    8279 
     80    protected String highlight_field = null; 
     81     
    8382    String collection_core_name_prefix = null; 
    8483 
     
    109108    } 
    110109  } 
    111  
     110  public void setHighlightField(String hl_field) 
     111  { 
     112    this.highlight_field = hl_field; 
     113  } 
    112114  public void setSortOrder(String order) 
    113115  { 
     
    360362        solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')");  
    361363         
     364        //Turn on highlighting 
     365        solrQuery.setHighlight(true); 
     366        //Return 3 snippets for each document 
     367        solrQuery.setParam("hl.snippets", "3"); 
     368        solrQuery.setParam("hl.fl", highlight_field); 
     369        solrQuery.setHighlightSimplePre("&lt;span class=\"snippetText\"&gt;"); 
     370         
     371        //Set text which appears after highlighted term 
     372        solrQuery.setHighlightSimplePost("&lt;/span&gt;"); 
     373         
    362374        //solrQuery.setTerms(true); // turn on the termsComponent        
    363375        //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method 
     
    392404            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams); 
    393405            SolrDocumentList hits = solrResponse.getResults(); 
     406            Map<String, Map<String, List<String>>> hlResponse = solrResponse.getHighlighting(); 
     407            solr_query_result.setHighlightResults(hlResponse); 
    394408            //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml 
    395409 
     
    410424                solr_query_result.setStartResults(start_results); 
    411425                solr_query_result.setEndResults(start_results + hits.size()); 
    412  
    413                  
     426                     
    414427                // get the first field we're searching in, this will be the fallback field 
    415428                int sepIndex = query_string.indexOf(":"); 
     
    505518        return solr_query_result; 
    506519    } 
     520// Highlighting query. Returns full highlighted text for document 
     521    public String runHighlightingQuery(String query,String hldocOID) 
     522    { 
     523                     
     524        SolrQueryResult solr_query_result = new SolrQueryResult(); 
     525        solr_query_result.clear(); 
     526 
     527         
     528        /* Create Query*/ 
     529         
     530        SolrQuery solrQuery = new SolrQuery(query); 
     531         
     532        /* Set Query Parameters*/ 
     533         
     534        //Turn on highlighting 
     535        solrQuery.setHighlight(true); 
     536        //Extract default field from query 
     537         
     538        //Set field for highlighting 
     539        solrQuery.setParam("hl.fl", highlight_field); 
     540         
     541        //Get whole highlighted field 
     542        solrQuery.setHighlightFragsize(0); 
     543         
     544        //Return only required document by docOID 
     545        solrQuery.setFilterQueries("docOID:"+ hldocOID); 
     546         
     547        //Set text which appears before highlighted term 
     548        //solrQuery.setHighlightSimplePre("<annotation type=\"query_term\">"); 
     549        solrQuery.setHighlightSimplePre("<span class=\"termHighlight\">"); 
     550        //Set text which appears after highlighted term 
     551        //solrQuery.setHighlightSimplePost("</annotation>"); 
     552        solrQuery.setHighlightSimplePost("</span>"); 
     553        //Prepare results 
     554        String text = null; 
     555        // do the query 
     556        try 
     557        { 
     558            QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams); 
     559            //Get highlighting results 
     560            Map<String,Map<String,List<String>>> highlightingResults = solrResponse.getHighlighting(); 
     561            //Get highlighted document text 
     562            text = highlightingResults.get(hldocOID).get(highlight_field).get(0); 
     563             
     564                                                 
     565        } 
     566        catch (SolrServerException server_exception) 
     567        { 
     568            server_exception.printStackTrace(); 
     569             
     570        } 
     571        return text; 
     572    } 
    507573 
    508574    //Greenstone universe operates with a base of 1 for "start_results"