Changeset 33327 for gs3-extensions
- Timestamp:
- 2019-07-18T22:45:22+12:00 (5 years ago)
- Location:
- gs3-extensions/solr/trunk/src/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs3-extensions/solr/trunk/src/perllib/solrbuilder.pm
r32179 r33327 207 207 # build_cfg as, unlike in MGPP, we need these mappings in advance to configure 208 208 # Lucene/Solr. Unfortunately the original function found in mgbuilder.pm makes 209 # a mess of this - it only output fields that have been processed (none have)209 # a mess of this - it only outputs fields that have been processed (none have) 210 210 # and it has a hardcoded renaming for 'text' so it becomes 'TX' according to 211 211 # the schema but 'TE' according to XML sent to lucene_passes.pl/solr_passes.pl 212 # This version is dumber - just copy them all across verbat um - but works. We212 # This version is dumber - just copy them all across verbatim - but works. We 213 213 # do still need to support the special case of 'allfields' 214 214 sub make_final_field_list … … 286 286 $schema_insert_xml .= "<field name=\"$field\" "; 287 287 288 if($field eq "LA" || $field eq "LO") 289 { 290 $schema_insert_xml .= "type=\"location\" "; 288 if($field eq "CD" || $field eq "CS") { 289 # Coordinate and CoordShort meta should not be split but treated as a whole string for searching. So type=string, not type=text_en_splitting 290 # Can't set to type="location", which uses solr.LatLonType, since type=location fields "must not be multivalued" as per conf/schema.xml.in. 291 # And we can have multiple Coordinate (and multiple CoordShort) meta for one doc, so multivalued=true. 292 # Not certain what to set stored to. As per conf/schema.xml.in, stored=false means "you only need to search on the field but 293 # don't need to return the original value". And they advise to set stored="false" for all fields possible (esp large fields)." 294 # But stored=false makes it not visible in Luke. So setting stored=true as for other fields 295 # TermVector: '"A term vector is a list of the document's terms and their number of occurrences in that documented." 296 # Each document has one term vector which is a list.' 
(http://makble.com/what-is-term-vector-in-lucene and lucene API for Field.TermVector) 297 # e.g. docA contains, "cat" 5 times, "dog" 10 times. We don't care to treat Coordinate meta as a term: not a "term" occurring 298 # in the doc, and don't care how often a Coordinate occurs in a document. 299 # Consequently, we don't care about term positions and term offsets for Coordinate meta either. 300 301 $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n"; 291 302 } 292 # elsif ($field ne "ZZ" && $field ne "TX") 293 # { 294 # $schema_insert_xml .= "type=\"string\" "; 295 # } 296 else 297 { 298 #$schema_insert_xml .= "type=\"text_en_splitting\" "; 299 300 # original default solr field type for all fields is text_en_splitting 301 my $solrfieldtype = "text_en_splitting"; 302 if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) { 303 $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}; 304 #print STDERR "@@@@#### found TYPE: $solrfieldtype\n"; 305 } 306 $schema_insert_xml .= "type=\"$solrfieldtype\" "; 303 304 elsif($field eq "ML") { 305 # mapLabel: same attributes as for coord meta CD and CS above 306 # mapLabel is also like facets with type="string" to not get tokenized, and multiValued="true" to allow each shape's label to be stored distinctly 307 $schema_insert_xml .= "type=\"string\" indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"false\" termPositions=\"false\" termOffsets=\"false\" />\n"; 308 } 309 310 else { 311 if($field eq "LT" || $field eq "LO") # full Latitude and Longitude coordinate meta, not the short variants (LatShort/LA and LongShort/LN) 312 { 313 # Latitude and Longitude is being phased out in favour of using Coord meta. 
314 # However, if ever returning to using Lat and Lng instead of Coord meta, then the way the Lat Lng meta is currently written out for type="location" 315 # is in the wrong format. Lat and Lng shouldn't get written out separately but as: Lat,Lng 316 # It gets written out in solrbuildproc.pm, I think, so that would be where it needs to be corrected. 317 # For more info on type=location for our solr 4.7.2 or thereabouts, see https://web.archive.org/web/20160312154250/https://wiki.apache.org/solr/SpatialSearchDev 318 # which states: 319 # When indexing, the format is something like: 320 # <field name="store_lat_lon">12.34,-123.45</field> 321 # 322 $schema_insert_xml .= "type=\"location\" "; 323 } 307 324 325 326 # elsif ($field ne "ZZ" && $field ne "TX") 327 # { 328 # $schema_insert_xml .= "type=\"string\" "; 329 # } 330 else 331 { 332 #$schema_insert_xml .= "type=\"text_en_splitting\" "; 333 334 # original default solr field type for all fields is text_en_splitting 335 my $solrfieldtype = "text_en_splitting"; 336 if(defined $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}) { 337 $solrfieldtype = $self->{'collect_cfg'}->{'indexfieldoptions'}->{$fullfieldname}->{'solrfieldtype'}; 338 #print STDERR "@@@@#### found TYPE: $solrfieldtype\n"; 339 } 340 $schema_insert_xml .= "type=\"$solrfieldtype\" "; 341 342 } 343 # set termVectors=\"true\" when term vectors info is required, 344 # see TermsResponse termResponse = solrResponse.getTermsResponse(); 345 $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n"; 308 346 } 309 # set termVectors=\"true\" when term vectors info is required,310 # see TermsResponse termResponse = solrResponse.getTermsResponse();311 $schema_insert_xml .= "indexed=\"true\" stored=\"true\" multiValued=\"true\" termVectors=\"true\" termPositions=\"true\" termOffsets=\"true\" />\n";312 347 } 313 348 -
gs3-extensions/solr/trunk/src/perllib/solrbuildproc.pm
r32441 r33327 202 202 203 203 } 204 sub create_shortname { 204 205 # UNUSED now by default. 206 # Georgy overrode the mgppbuildproc::create_shortname() method in commit 32441 to create the method below, to override the inherited 207 # behaviour so that create_shortname() worked appropriately for his use cases involving multiple analyzers. 208 # As a result, create_shortname() for solr no longer did a lookup into the %mgppbuildproc::static_indexfield_map for registered shortnames. 209 # For the rest, this method is a copy of mgppbuildproc::create_shortname(). 210 # But we want the original mgppbuildproc::create_shortname() behaviour restored, as it does the lookups into %static_indexfield_map that are necessary for us. 211 # So we've renamed this function to create_shortname_multi_solr_analyzer below so it won't get called as default behaviour any more. 212 # Rename to create_shortname() when requiring Georgy's behaviour. 213 sub create_shortname_multi_solr_analyzer { 205 214 my $self = shift(@_); 206 215 … … 500 509 501 510 if ($section_text ne "") { 502 $new_text .= "$section_text "; 511 512 if ($allfields_index) { 513 $allfields_text .= "$section_text "; 514 } 515 516 # Remove any leading or trailing white space 517 $section_text =~ s/\s+$//; 518 $section_text =~ s/^\s+//; 519 520 if ($self->{'indexing_text'}) { 521 # add the tag 522 $new_text .= "<field name=\"$shortname\" >$section_text</field>\n"; 523 } else { 524 $new_text .= "$section_text "; 525 } 503 526 } 504 527 505 528 foreach my $item (@metadata_list) { 506 529 &ghtml::htmlsafe($item); 507 $new_text .= "$item "; 508 } 509 510 if ($allfields_index) { 511 $allfields_text .= $new_text; 512 } 513 514 # Remove any leading or trailing white space 515 $new_text =~ s/\s+$//; 516 $new_text =~ s/^\s+//; 517 530 531 if ($allfields_index) { 532 $allfields_text .= "$item "; 533 } 534 535 # Remove any leading or trailing white space 536 $item =~ s/\s+$//; 537 $item =~ s/^\s+//; 538 539 if ($self->{'indexing_text'}) { 540 
# add the tag 541 $new_text .= "<field name=\"$shortname\" >$item</field>\n"; 542 } else { 543 $new_text .= "$item "; 544 } 545 } # end for loop processing @metadata_list 518 546 519 if ($self->{'indexing_text'}) {520 # add the tag521 $new_text = "<field name=\"$shortname\" >$new_text</field>\n";522 }523 547 # filter the text 524 548 $new_text = $self->filter_text ($field, $new_text); … … 669 693 670 694 foreach my $item (@metadata_list) { 671 &ghtml::htmlsafe($item); 672 673 $item = "<field name=\"$sf_shortname\">$item</field>\n"; 674 # filter the text??? 675 $text .= "$item"; # add it to the main text block 676 #print "#### new_text: $item\n"; 695 &ghtml::htmlsafe($item); 696 if ($item =~ /\S/) { 697 $item = "<field name=\"$sf_shortname\">$item</field>\n"; 698 # filter the text??? 699 $text .= "$item"; # add it to the main text block 700 #print "#### new_text: $item\n"; 701 } 677 702 } 678 703 if(scalar @metadata_list > 0) {
Note:
See TracChangeset
for help on using the changeset viewer.