Changeset 30050 for gs3-extensions
- Timestamp:
- 2015-07-21T05:35:34+12:00 (9 years ago)
- Location:
- gs3-extensions/solr/trunk/src
- Files:
-
- 6 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/solr/trunk/src/conf/schema.xml.in
r29932 r30050 478 478 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> 479 479 <analyzer type="index"> 480 <charFilter class="solr.HTMLStripCharFilterFactory"/> 480 481 <tokenizer class="solr.StandardTokenizerFactory"/> 481 482 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" /> … … 500 501 <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> 501 502 <analyzer type="index"> 503 <charFilter class="solr.HTMLStripCharFilterFactory"/> 502 504 <tokenizer class="solr.StandardTokenizerFactory"/> 503 505 <!-- in this example, we will only use synonyms at query time … … 548 550 <fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> 549 551 <analyzer type="index"> 552 <charFilter class="solr.HTMLStripCharFilterFactory"/> 550 553 <tokenizer class="solr.WhitespaceTokenizerFactory"/> 551 554 <!-- in this example, we will only use synonyms at query time … … 583 586 <fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true"> 584 587 <analyzer> 588 <charFilter class="solr.HTMLStripCharFilterFactory"/> 585 589 <tokenizer class="solr.WhitespaceTokenizerFactory"/> 586 590 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> … … 762 766 <fieldType name="text_ar" class="solr.TextField" positionIncrementGap="100"> 763 767 <analyzer> 768 <charFilter class="solr.HTMLStripCharFilterFactory"/> 764 769 <tokenizer class="solr.StandardTokenizerFactory"/> 765 770 <!-- for any non-arabic --> … … 775 780 <fieldType name="text_bg" class="solr.TextField" positionIncrementGap="100"> 776 781 <analyzer> 782 <charFilter class="solr.HTMLStripCharFilterFactory"/> 777 783 <tokenizer class="solr.StandardTokenizerFactory"/> 778 784 <filter class="solr.LowerCaseFilterFactory"/> … … 785 791 <fieldType name="text_ca" class="solr.TextField" positionIncrementGap="100"> 786 792 <analyzer> 793 <charFilter class="solr.HTMLStripCharFilterFactory"/> 787 794 <tokenizer class="solr.StandardTokenizerFactory"/> 788 795 <!-- removes l', etc --> … … 797 804 <fieldType name="text_cjk" class="solr.TextField" positionIncrementGap="100"> 798 805 <analyzer> 806 <charFilter class="solr.HTMLStripCharFilterFactory"/> 799 807 <tokenizer class="solr.StandardTokenizerFactory"/> 800 808 <!-- normalize width before bigram, as e.g. half-width dakuten combine --> … … 809 817 <fieldType name="text_ckb" class="solr.TextField" positionIncrementGap="100"> 810 818 <analyzer> 819 <charFilter class="solr.HTMLStripCharFilterFactory"/> 811 820 <tokenizer class="solr.StandardTokenizerFactory"/> 812 821 <filter class="solr.SoraniNormalizationFilterFactory"/> … … 821 830 <fieldType name="text_cz" class="solr.TextField" positionIncrementGap="100"> 822 831 <analyzer> 832 <charFilter class="solr.HTMLStripCharFilterFactory"/> 823 833 <tokenizer class="solr.StandardTokenizerFactory"/> 824 834 <filter class="solr.LowerCaseFilterFactory"/> … … 831 841 <fieldType name="text_da" class="solr.TextField" positionIncrementGap="100"> 832 842 <analyzer> 843 <charFilter class="solr.HTMLStripCharFilterFactory"/> 833 844 <tokenizer class="solr.StandardTokenizerFactory"/> 834 845 <filter class="solr.LowerCaseFilterFactory"/> … … 841 852 <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100"> 842 853 <analyzer> 854 <charFilter class="solr.HTMLStripCharFilterFactory"/> 843 855 <tokenizer class="solr.StandardTokenizerFactory"/> 844 856 <filter class="solr.LowerCaseFilterFactory"/> … … 854 866 <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100"> 855 867 <analyzer> 868 <charFilter class="solr.HTMLStripCharFilterFactory"/> 856 869 <tokenizer class="solr.StandardTokenizerFactory"/> 857 870 <!-- greek specific lowercase for sigma --> … … 865 878 <fieldType name="text_es" class="solr.TextField" positionIncrementGap="100"> 866 879 <analyzer> 880 <charFilter class="solr.HTMLStripCharFilterFactory"/> 867 881 <tokenizer class="solr.StandardTokenizerFactory"/> 868 882 <filter class="solr.LowerCaseFilterFactory"/> … … 876 890 <fieldType name="text_eu" class="solr.TextField" positionIncrementGap="100"> 877 891 <analyzer> 892 <charFilter class="solr.HTMLStripCharFilterFactory"/> 878 893 <tokenizer class="solr.StandardTokenizerFactory"/> 879 894 <filter class="solr.LowerCaseFilterFactory"/> … … 887 902 <analyzer> 888 903 <!-- for ZWNJ --> 904 <charFilter class="solr.HTMLStripCharFilterFactory"/> 889 905 <charFilter class="solr.PersianCharFilterFactory"/> 890 906 <tokenizer class="solr.StandardTokenizerFactory"/> … … 899 915 <fieldType name="text_fi" class="solr.TextField" positionIncrementGap="100"> 900 916 <analyzer> 917 <charFilter class="solr.HTMLStripCharFilterFactory"/> 901 918 <tokenizer class="solr.StandardTokenizerFactory"/> 902 919 <filter class="solr.LowerCaseFilterFactory"/> … … 910 927 <fieldType name="text_fr" class="solr.TextField" positionIncrementGap="100"> 911 928 <analyzer> 929 <charFilter class="solr.HTMLStripCharFilterFactory"/> 912 930 <tokenizer class="solr.StandardTokenizerFactory"/> 913 931 <!-- removes l', etc --> … … 924 942 <fieldType name="text_ga" class="solr.TextField" positionIncrementGap="100"> 925 943 <analyzer> 944 <charFilter class="solr.HTMLStripCharFilterFactory"/> 926 945 <tokenizer class="solr.StandardTokenizerFactory"/> 927 946 <!-- removes d', etc --> … … 938 957 <fieldType name="text_gl" class="solr.TextField" positionIncrementGap="100"> 939 958 <analyzer> 959 <charFilter class="solr.HTMLStripCharFilterFactory"/> 940 960 <tokenizer class="solr.StandardTokenizerFactory"/> 941 961 <filter class="solr.LowerCaseFilterFactory"/> … … 949 969 <fieldType name="text_hi" class="solr.TextField" positionIncrementGap="100"> 950 970 <analyzer> 971 <charFilter class="solr.HTMLStripCharFilterFactory"/> 951 972 <tokenizer class="solr.StandardTokenizerFactory"/> 952 973 <filter class="solr.LowerCaseFilterFactory"/> … … 963 984 <fieldType name="text_hu" class="solr.TextField" positionIncrementGap="100"> 964 985 <analyzer> 986 <charFilter class="solr.HTMLStripCharFilterFactory"/> 965 987 <tokenizer class="solr.StandardTokenizerFactory"/> 966 988 <filter class="solr.LowerCaseFilterFactory"/> … … 974 996 <fieldType name="text_hy" class="solr.TextField" positionIncrementGap="100"> 975 997 <analyzer> 998 <charFilter class="solr.HTMLStripCharFilterFactory"/> 976 999 <tokenizer class="solr.StandardTokenizerFactory"/> 977 1000 <filter class="solr.LowerCaseFilterFactory"/> … … 984 1007 <fieldType name="text_id" class="solr.TextField" positionIncrementGap="100"> 985 1008 <analyzer> 1009 <charFilter class="solr.HTMLStripCharFilterFactory"/> 986 1010 <tokenizer class="solr.StandardTokenizerFactory"/> 987 1011 <filter class="solr.LowerCaseFilterFactory"/> … … 995 1019 <fieldType name="text_it" class="solr.TextField" positionIncrementGap="100"> 996 1020 <analyzer> 1021 <charFilter class="solr.HTMLStripCharFilterFactory"/> 997 1022 <tokenizer class="solr.StandardTokenizerFactory"/> 998 1023 <!-- removes l', etc --> … … 1041 1066 See http://wiki.apache.org/solr/JapaneseLanguageSupport for more on Japanese language support. 1042 1067 --> 1068 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1043 1069 <tokenizer class="solr.JapaneseTokenizerFactory" mode="search"/> 1044 1070 <!--<tokenizer class="solr.JapaneseTokenizerFactory" mode="search" userDictionary="lang/userdict_ja.txt"/>--> … … 1061 1087 <fieldType name="text_lv" class="solr.TextField" positionIncrementGap="100"> 1062 1088 <analyzer> 1089 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1063 1090 <tokenizer class="solr.StandardTokenizerFactory"/> 1064 1091 <filter class="solr.LowerCaseFilterFactory"/> … … 1071 1098 <fieldType name="text_nl" class="solr.TextField" positionIncrementGap="100"> 1072 1099 <analyzer> 1100 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1073 1101 <tokenizer class="solr.StandardTokenizerFactory"/> 1074 1102 <filter class="solr.LowerCaseFilterFactory"/> … … 1082 1110 <fieldType name="text_no" class="solr.TextField" positionIncrementGap="100"> 1083 1111 <analyzer> 1112 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1084 1113 <tokenizer class="solr.StandardTokenizerFactory"/> 1085 1114 <filter class="solr.LowerCaseFilterFactory"/> … … 1095 1124 <fieldType name="text_pt" class="solr.TextField" positionIncrementGap="100"> 1096 1125 <analyzer> 1126 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1097 1127 <tokenizer class="solr.StandardTokenizerFactory"/> 1098 1128 <filter class="solr.LowerCaseFilterFactory"/> … … 1108 1138 <fieldType name="text_ro" class="solr.TextField" positionIncrementGap="100"> 1109 1139 <analyzer> 1140 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1110 1141 <tokenizer class="solr.StandardTokenizerFactory"/> 1111 1142 <filter class="solr.LowerCaseFilterFactory"/> … … 1118 1149 <fieldType name="text_ru" class="solr.TextField" positionIncrementGap="100"> 1119 1150 <analyzer> 1151 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1120 1152 <tokenizer class="solr.StandardTokenizerFactory"/> 1121 1153 <filter class="solr.LowerCaseFilterFactory"/> … … 1127 1159 <!-- Russian with morphology--> 1128 1160 <fieldType name="text_ru_morph" class="solr.TextField" positionIncrementGap="100"> 1129 <analyzer> 1130 <tokenizer class="solr.StandardTokenizerFactory"/> 1131 <filter class="solr.LowerCaseFilterFactory"/> 1132 <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" /> 1133 <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/> 1134 </analyzer> 1161 <analyzer> 1162 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1163 <tokenizer class="solr.StandardTokenizerFactory"/> 1164 <filter class="solr.LowerCaseFilterFactory"/> 1165 <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_ru.txt" format="snowball" /> 1166 <filter class="org.apache.lucene.morphology.russian.RussianFilterFactory"/> 1167 </analyzer> 1135 1168 </fieldType> 1136 1169 … … 1138 1171 <fieldType name="text_sv" class="solr.TextField" positionIncrementGap="100"> 1139 1172 <analyzer> 1173 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1140 1174 <tokenizer class="solr.StandardTokenizerFactory"/> 1141 1175 <filter class="solr.LowerCaseFilterFactory"/> … … 1149 1183 <fieldType name="text_th" class="solr.TextField" positionIncrementGap="100"> 1150 1184 <analyzer> 1185 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1151 1186 <tokenizer class="solr.StandardTokenizerFactory"/> 1152 1187 <filter class="solr.LowerCaseFilterFactory"/> … … 1159 1194 <fieldType name="text_tr" class="solr.TextField" positionIncrementGap="100"> 1160 1195 <analyzer> 1196 <charFilter class="solr.HTMLStripCharFilterFactory"/> 1161 1197 <tokenizer class="solr.StandardTokenizerFactory"/> 1162 1198 <filter class="solr.TurkishLowerCaseFilterFactory"/> -
gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm
r29711 r30050 310 310 # see TermsResponse termResponse = solrResponse.getTermsResponse(); 311 311 #$schema_insert_xml .= "indexed=\"true\" stored=\"false\" termVectors=\"true\" multiValued=\"true\" />\n"; 312 $schema_insert_xml .= "indexed=\"true\" stored=\" false\" multiValued=\"true\" />\n";312 $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n"; 313 313 #$schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" />\n"; 314 314 } -
gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm
r29945 r30050 430 430 if ($self->{'indexing_text'}) { 431 431 # we always strip html 432 $section_text = $self->preprocess_text($section_text, 1, ""); 432 &ghtml::htmlsafe($section_text); 433 #$section_text = $self->preprocess_text($section_text, 1, ""); 433 434 } 434 435 else { -
gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/service/GS2SolrSearch.java
r29711 r30050 65 65 paramDefaults.put(SORT_ORDER_PARAM, SORT_ORDER_DESCENDING); 66 66 does_faceting = true; 67 does_highlight_snippets = true; 68 does_full_field_highlighting = true; 67 69 // Used to store the solr cores that match the required 'level' 68 70 // of search (e.g. either document-level=>didx, or … … 341 343 try 342 344 { 345 //if it is a Highlighting Query - execute it 346 this.solr_src.setHighlightField(indexField); 347 if(hldocOID != null) 348 { 349 String rslt = this.solr_src.runHighlightingQuery(query,hldocOID); 350 return rslt; 351 } 343 352 SharedSoleneQueryResult sqr = this.solr_src.runQuery(query); 344 353 … … 352 361 return null; 353 362 } 354 363 364 355 365 /** get the total number of docs that match */ 356 366 protected long numDocsMatched(Object query_result) … … 445 455 446 456 return newFacetList; 457 } 458 @Override 459 protected Map<String, Map<String, List<String>>> getHighlightSnippets(Object query_result) 460 { 461 if (!(query_result instanceof SolrQueryResult)) 462 { 463 return null; 464 } 465 466 SolrQueryResult result = (SolrQueryResult) query_result; 467 468 return result.getHighlightResults(); 447 469 } 448 470 -
gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryResult.java
r29142 r30050 28 28 29 29 import java.util.List; 30 import java.util.Map; 30 31 31 32 import org.apache.solr.client.solrj.response.FacetField; … … 41 42 { 42 43 protected List<FacetField> _facetResults = null; 44 protected Map<String,Map<String,List<String>>> _highlightResults = null; 43 45 SolrQueryResult() 44 46 { 45 47 super(); 46 48 } 47 48 49 public void setFacetResults(List<FacetField> facetResults) 49 50 { … … 55 56 return _facetResults; 56 57 } 58 //Save highlighting snippets 59 public void setHighlightResults(Map<String,Map<String,List<String>>> highlightResults){ 60 _highlightResults = highlightResults; 61 } 62 //Extract highlighting snippets 63 public Map<String,Map<String,List<String>>> getHighlightResults(){ 64 return _highlightResults; 65 } 66 57 67 } -
gs3-extensions/solr/trunk/src/src/java/org/greenstone/gsdl3/util/SolrQueryWrapper.java
r29987 r30050 33 33 import java.util.Iterator; 34 34 import java.util.List; 35 import java.util.Map; 35 36 import java.util.Set; 36 37 import java.util.HashSet; 37 38 38 import java.util.regex.Pattern; 39 39 import java.util.regex.Matcher; … … 46 46 import org.apache.solr.client.solrj.response.QueryResponse; 47 47 import org.apache.solr.client.solrj.response.TermsResponse; 48 49 48 import org.apache.solr.core.CoreContainer; 50 49 import org.apache.solr.core.SolrCore; 51 52 50 import org.apache.solr.common.SolrDocument; 53 51 import org.apache.solr.common.SolrDocumentList; … … 55 53 import org.greenstone.LuceneWrapper4.SharedSoleneQuery; 56 54 import org.greenstone.LuceneWrapper4.SharedSoleneQueryResult; 57 58 55 import org.apache.lucene.search.Query; // Query, TermQuery, BooleanQuery, BooleanClause and more 59 56 import org.apache.lucene.index.IndexReader; … … 81 78 SolrServer solr_core = null; 82 79 80 protected String highlight_field = null; 81 83 82 String collection_core_name_prefix = null; 84 83 … … 109 108 } 110 109 } 111 110 public void setHighlightField(String hl_field) 111 { 112 this.highlight_field = hl_field; 113 } 112 114 public void setSortOrder(String order) 113 115 { … … 360 362 solrQuery.setFields("docOID", "score"); //solrParams.set("fl", "docOID score totaltermfreq(field,'queryterm')"); 361 363 364 //Turn on highlighting 365 solrQuery.setHighlight(true); 366 //Return 3 snippets for each document 367 solrQuery.setParam("hl.snippets", "3"); 368 solrQuery.setParam("hl.fl", highlight_field); 369 solrQuery.setHighlightSimplePre("<span class=\"snippetText\">"); 370 371 //Set text which appears after highlighted term 372 solrQuery.setHighlightSimplePost("</span>"); 373 362 374 //solrQuery.setTerms(true); // turn on the termsComponent 363 375 //solrQuery.set("terms.fl", "ZZ"); // which field to get the terms from. ModifiableSolrParams method … … 392 404 QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams); 393 405 SolrDocumentList hits = solrResponse.getResults(); 406 Map<String, Map<String, List<String>>> hlResponse = solrResponse.getHighlighting(); 407 solr_query_result.setHighlightResults(hlResponse); 394 408 //TermsResponse termResponse = solrResponse.getTermsResponse(); // null unless termvectors=true in schema.xml 395 409 … … 410 424 solr_query_result.setStartResults(start_results); 411 425 solr_query_result.setEndResults(start_results + hits.size()); 412 413 426 414 427 // get the first field we're searching in, this will be the fallback field 415 428 int sepIndex = query_string.indexOf(":"); … … 505 518 return solr_query_result; 506 519 } 520 // Highlighting query. Returns full highlighted text for document 521 public String runHighlightingQuery(String query,String hldocOID) 522 { 523 524 SolrQueryResult solr_query_result = new SolrQueryResult(); 525 solr_query_result.clear(); 526 527 528 /* Create Query*/ 529 530 SolrQuery solrQuery = new SolrQuery(query); 531 532 /* Set Query Parameters*/ 533 534 //Turn on highlighting 535 solrQuery.setHighlight(true); 536 //Extract default field from query 537 538 //Set field for highlighting 539 solrQuery.setParam("hl.fl", highlight_field); 540 541 //Get whole highlighted field 542 solrQuery.setHighlightFragsize(0); 543 544 //Return only required document by docOID 545 solrQuery.setFilterQueries("docOID:"+ hldocOID); 546 547 //Set text which appears before highlighted term 548 //solrQuery.setHighlightSimplePre("<annotation type=\"query_term\">"); 549 solrQuery.setHighlightSimplePre("<span class=\"termHighlight\">"); 550 //Set text which appears after highlighted term 551 //solrQuery.setHighlightSimplePost("</annotation>"); 552 solrQuery.setHighlightSimplePost("</span>"); 553 //Prepare results 554 String text = null; 555 // do the query 556 try 557 { 558 QueryResponse solrResponse = solr_core.query(solrQuery); //solr_core.query(solrParams); 559 //Get highliting results 560 Map<String,Map<String,List<String>>> highlightingResults = solrResponse.getHighlighting(); 561 //Get highlited document text 562 text = highlightingResults.get(hldocOID).get(highlight_field).get(0); 563 564 565 } 566 catch (SolrServerException server_exception) 567 { 568 server_exception.printStackTrace(); 569 570 } 571 return text; 572 } 507 573 508 574 //Greenstone universe operates with a base of 1 for "start_results"
Note:
See TracChangeset
for help on using the changeset viewer.