Changeset 33327 for gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm
- Timestamp:
- 2019-07-18T22:45:22+12:00 (5 years ago)
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm
r32179 r33327 207 207 # build_cfg as, unlike in MGPP, we need these mappings in advance to configure 208 208 # Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes 209 # a mess of this - it only output fields that have been processed (none have)209 # a mess of this - it only outputs fields that have been processed (none have) 210 210 # and it has a hardcoded renaming for 'text' so it becomes 'TX' according to 211 211 # the schema but 'TE' according to XML sent to lucene_passes.pl/solr_passes.pl 212 # This version is dumber - just copy them all across verbat um - but works. We212 # This version is dumber - just copy them all across verbatim - but works. We 213 213 # do still need to support the special case of 'allfields' 214 214 sub make_final_field_list … … 286 286 $schema_insert_xml .= "<field name=\"$field\" "; 287 287 288 if($field eq "LA" || $field eq "LO") 289 { 290 $schema_insert_xml .= "type=\"location\" "; 288 if($field eq "CD" || $field eq "CS") { 289 # Coordinate and CoordShort meta should not be split but treated as a whole string for searching. So type=string, not type=text_en_splitting 290 # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in. 291 # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multivalued=true. 292 # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but 293 # don't need to return the original value". And they advice to set stored="false" for all fields possible (esp large fields)." 294 # But stored=false makes it not visible in Luke. So setting stored=true as for other fields 295 # TermVector: '"A term vector is a list of the document's terms and their number of occurrences in that documented." 296 # Each document has one term vector which is a list.' 
(http://makble.com/what-is-term-vector-in-lucene and lucene API for Field.TermVector) 297 # e.g. docA contains, "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: not a "term" occurring 298 # in the doc, and don't care how often a Coordinate occurs in a document. 299 # Consequently, we don't care about term positions and term offsets for Coordinate meta either. 300 301 $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n"; 291 302 } 292 # elsif ($field ne "ZZ" && $field ne "TX") 293 # { 294 # $schema_insert_xml .= "type=\"string\" "; 295 # } 296 else 297 { 298 #$schema_insert_xml .= "type=\"text_en_splitting\" "; 299 300 # original default solr field type for all fields is text_en_splitting 301 my $solrfieldtype = "text_en_splitting"; 302 if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) { 303 $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}; 304 #print STDERR "@@@@#### found TYPE: $solrfieldtype\n"; 305 } 306 $schema_insert_xml .= "type=\"$solrfieldtype\" "; 303 304 elsif($field eq "ML") { 305 # mapLabel: same attributes as for coord meta CD and CS above 306 # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly 307 $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n"; 308 } 309 310 else { 311 if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN) 312 { 313 # Latitude and Longitude is being phased out in favour of using Coord meta. 
314 # However, if ever returning to using Lat and Lng instead of Coord meta, then the way the Lat Lng meta is currently written out for type="location" 315 # is in the wrong format. Lat and Lng shouldn't get written out separately but as: Lat,Lng 316 # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected. 317 # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev 318 # which states: 319 # When indexing, the format is something like: 320 # <field name="store_lat_lon">12.34,-123.45</field> 321 # 322 $schema_insert_xml .= "type=\"location\" "; 323 } 307 324 325 326 # elsif ($field ne "ZZ" && $field ne "TX") 327 # { 328 # $schema_insert_xml .= "type=\"string\" "; 329 # } 330 else 331 { 332 #$schema_insert_xml .= "type=\"text_en_splitting\" "; 333 334 # original default solr field type for all fields is text_en_splitting 335 my $solrfieldtype = "text_en_splitting"; 336 if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) { 337 $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}; 338 #print STDERR "@@@@#### found TYPE: $solrfieldtype\n"; 339 } 340 $schema_insert_xml .= "type=\"$solrfieldtype\" "; 341 342 } 343 # set termVectors=\"true\" when term vectors info is required, 344 # see TermsResponse termResponse = solrResponse.getTermsResponse(); 345 $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n"; 308 346 } 309 # set termVectors=\"true\" when term vectors info is required,310 # see TermsResponse termResponse = solrResponse.getTermsResponse();311 $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";312 347 } 313 348
Note: See TracChangeset for help on using the changeset viewer.