Changeset 17567

Show
Ignore:
Timestamp:
22.10.2008 13:26:24 (11 years ago)
Author:
kjdon
Message:

If the 'metadata' index is specified, only add the metadata fields that are not already indexed; there is no point in indexing them twice.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/mgppbuildproc.pm

    r17564 r17567  
    266266     
    267267    $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 
     268 
     269    # has the user added a 'metadata' index? 
     270    my $all_metadata_specified = 0;  
     271    # which fields have already been indexed? (same as fields, but in a map) 
     272    my $specified_fields = {}; 
    268273    foreach my $field (split (/;/, $fields)) { 
    269274        # only deal with this field if it doesn't start with top or 
     
    274279        my $new_text = "";  
    275280 
    276         # we get allfields by default - do nothing 
    277         if ($real_field eq "allfields") {  
    278      
     281        # we get allfields by default - do nothing except add into the map 
     282        if ($real_field eq "allfields") { 
     283        $self->{'indexfieldmap'}->{"allfields"} = "ZZ"; 
     284        $self->{'indexfieldmap'}->{"ZZ"} = 1; 
    279285        } 
    280286         
    281287        # metadata - output all metadata we know about except gsdl stuff 
    282288        # each metadata is in a separate index field 
    283         elsif ($real_field eq "metadata") {  
    284         my $shortname = ""; 
    285         my $metadata = $doc_obj->get_all_metadata ($section); 
    286         foreach my $pair (@$metadata) { 
    287             my ($mfield, $mvalue) = (@$pair); 
    288             # check fields here, maybe others dont want - change to use dontindex!! 
    289             if ($mfield ne "Identifier"  
    290             && $mfield !~ /^gsdl/  
    291             && $mfield ne "classifytype" 
    292             && $mfield ne "assocfilepath" 
    293             && defined $mvalue && $mvalue ne "") { 
    294              
    295             if (defined $self->{'indexfieldmap'}->{$mfield}) { 
    296                 $shortname = $self->{'indexfieldmap'}->{$mfield}; 
    297             } 
    298             else { 
    299                 $shortname = $self->create_shortname($mfield); 
    300                 $self->{'indexfieldmap'}->{$mfield} = $shortname; 
    301                 $self->{'indexfieldmap'}->{$shortname} = 1; 
    302             }       
    303             # should this line only be done if the following test is true? 
    304             $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 
    305             if (!defined $self->{'indexfields'}->{$mfield}) { 
    306                 $self->{'indexfields'}->{$mfield} = 1; 
    307             }                    
    308             } 
    309         } 
    310         } 
     289        if ($real_field eq "metadata") {  
     290        # we will process this later, so we are not reindexing metadata already indexed 
     291        $all_metadata_specified = 1; 
     292        } 
     293         
    311294        else { 
     295         
    312296        #individual metadata and or text specified - could be  
    313297        # a comma separated list 
     298        $specified_fields->{$real_field} = 1; 
    314299        my $shortname=""; 
    315300        my $new_field = 0; # have we found a new field name? 
     
    383368        $text .= "$new_text"; 
    384369    } # foreach field 
    385      
     370     
     371    if ($all_metadata_specified) { 
     372        my $new_text = ""; 
     373        my $shortname = ""; 
     374        my $metadata = $doc_obj->get_all_metadata ($section); 
     375        foreach my $pair (@$metadata) { 
     376        my ($mfield, $mvalue) = (@$pair); 
     377        # no value 
     378        next unless defined $mvalue && $mvalue ne ""; 
     379        # we have already indexed this 
     380        next if defined ($specified_fields->{$mfield}); 
     381        # check fields here, maybe others dont want - change to use dontindex!! 
     382        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath"); 
     383        next if ($mfield =~ /^gsdl/); 
     384         
     385             
     386        if (defined $self->{'indexfieldmap'}->{$mfield}) { 
     387            $shortname = $self->{'indexfieldmap'}->{$mfield}; 
     388        } 
     389        else { 
     390            $shortname = $self->create_shortname($mfield); 
     391            $self->{'indexfieldmap'}->{$mfield} = $shortname; 
     392            $self->{'indexfieldmap'}->{$shortname} = 1; 
     393        }       
     394        $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 
     395        if (!defined $self->{'indexfields'}->{$mfield}) { 
     396            $self->{'indexfields'}->{$mfield} = 1; 
     397        }                    
     398         
     399        } 
     400        # filter the text 
     401        $new_text = $self->filter_text ("metadata", $new_text); 
     402         
     403        $self->{'num_processed_bytes'} += length ($new_text); 
     404        $text .= "$new_text"; 
     405 
     406         
     407    } 
     408     
    386409    $text .= "$sectionendtag"; 
    387410    $section = $doc_obj->get_next_section($section); 
    388411    } # while defined section 
    389412    print $handle "$text\n$documentendtag";  
     413    #print STDERR "***********\n$text\n***************\n"; 
    390414     
    391415}