Changeset 27328

Show
Ignore:
Timestamp:
14.05.2013 11:09:58 (6 years ago)
Author:
kjdon
Message:

Changed the way we store the list of fields that have been indexed, and the mapping between index and shortname. They are now stored separately, to avoid calculating a shortname for a field each time a new document is indexed: previously, if a field had no value, its shortname was not remembered because the field wasn't indexed, so each new document caused the shortname to be calculated again. Also remove namespaces from meta fields before calculating shortnames, to make them more sensible, e.g. dc.Title->TI instead of DC.

Location:
main/trunk/greenstone2/perllib
Files:
2 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/mgppbuilder.pm

    r27306 r27328  
    489489    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language); 
    490490    $self->{'buildproc'}->set_indexing_text (1); 
    491     #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 
    492491    $self->{'buildproc'}->set_levels ($self->{'levels'});  
    493492    $self->{'buildproc'}->set_db_level ($db_level);    
     
    637636    # these now come from collection meta. if that is not defined, uses the metadata name 
    638637    my $collmeta = ""; 
    639     if (defined $self->{'build_cfg'}->{'indexfields'}) { 
    640     foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){ 
    641         my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield}; 
     638    if (defined $self->{'build_cfg'}->{'extraindexfields'}) { 
     639    foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}){ 
     640        my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield}; 
    642641        next if $shortfield eq 1; 
    643642         
     
    743742     
    744743    #add all fields bit  
    745     my $ifm = $self->{'buildproc'}->{'indexfieldmap'}; 
     744    my $fnm = $self->{'buildproc'}->{'fieldnamemap'}; 
    746745     
    747746    foreach $field (@specifiedfieldorder) { 
    748747    if ($field eq "metadata") { 
    749         foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) { 
     748        foreach my $newfield (keys %{$self->{'buildproc'}->{'extraindexfields'}}) { 
    750749        if (!defined $specifiedfields->{$newfield}) { 
    751             push (@indexfieldmap, "$newfield\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$newfield}"); 
     750            push (@indexfieldmap, "$newfield\-\>$fnm->{$newfield}"); 
    752751            push (@indexfields, "$newfield"); 
    753752        } 
     
    762761    } else { 
    763762        # we only add in the ones that have been processed 
    764         if (defined $ifm->{$field}) { 
    765         push (@indexfieldmap, "$field\-\>$ifm->{$field}"); 
     763        if (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) { 
     764        push (@indexfieldmap, "$field\-\>$fnm->{$field}"); 
    766765        push (@indexfields, "$field"); 
    767766        } 
  • main/trunk/greenstone2/perllib/mgppbuildproc.pm

    r24404 r27328  
    9090 
    9191    $self->{'dontindex'} = {}; 
    92     $self->{'indexfieldmap'} = {}; 
    93     $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index 
     92    $self->{'allindexfields'} = {}; # list of all actually indexed fields 
     93    $self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified. 
     94    $self->{'fieldnamemap'} = {'allfields'=>'ZZ', 
     95              'ZZ'=>1, 
     96              'text'=>'TX', 
     97              'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not. 
    9498    $self->{'strip_html'}=1; 
    9599     
    96100    return bless $self, $class; 
    97 } 
    98  
    99  
    100 #sub set_indexfieldmap { 
    101 #    my $self = shift (@_); 
    102 #    my ($indexmap) = @_; 
    103  
    104 #    $self->{'default_index_field_mapping'} = $indexmap; 
    105     #$self->{'indexfieldmap'} = $indexmap; 
    106 #} 
    107  
    108 sub get_indexfieldmap { 
    109     my $self = shift (@_); 
    110  
    111     return $self->{'indexfieldmap'}; 
    112101} 
    113102 
     
    279268        my $new_text = "";  
    280269 
    281         # we get allfields by default - do nothing except add into the map 
    282         if ($real_field eq "allfields") { 
    283         $self->{'indexfieldmap'}->{"allfields"} = "ZZ"; 
    284         $self->{'indexfieldmap'}->{"ZZ"} = 1; 
    285         } 
     270        # we get allfields by default  
     271        next if ($real_field eq "allfields");  
    286272         
    287273        # metadata - output all metadata we know about except gsdl stuff 
     
    290276        # we will process this later, so we are not reindexing metadata already indexed 
    291277        $all_metadata_specified = 1; 
     278        next; 
    292279        } 
    293280         
    294         else { 
    295          
    296281        #individual metadata and or text specified - could be  
    297282        # a comma separated list 
    298283        $specified_fields->{$real_field} = 1; 
    299284        my $shortname=""; 
    300         my $new_field = 0; # have we found a new field name? 
    301  
    302         if (defined $self->{'indexfieldmap'}->{$real_field}) { 
    303             $shortname = $self->{'indexfieldmap'}->{$real_field}; 
    304         } 
    305         else { 
    306             $shortname = $self->create_shortname($real_field); 
    307             $new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values 
    308         } 
     285 
     286        if (defined $self->{'fieldnamemap'}->{$real_field}) { 
     287        $shortname = $self->{'fieldnamemap'}->{$real_field}; 
     288        } else { 
     289        $shortname = $self->create_shortname($real_field); 
     290        $self->{'fieldnamemap'}->{$real_field} = $shortname; 
     291        $self->{'fieldnamemap'}->{$shortname} = 1; 
     292        } 
     293 
    309294        my @metadata_list = (); # put any meta values in here 
    310295        my $section_text = ""; # put any text in here 
     
    357342            # only add tags in if indexing 
    358343            $new_text .= "</$shortname>"; 
    359             } 
    360             if ($self->{'indexing_text'} && $new_field) { 
    361             # we need to add to the list in indexfields 
    362  
    363             $self->{'indexfieldmap'}->{$real_field} = $shortname; 
    364             $self->{'indexfieldmap'}->{$shortname} = 1; 
     344            $self->{'allindexfields'}->{$real_field} = 1; 
    365345            } 
    366346        } 
    367         } 
    368347 
    369348        # filter the text 
     
    388367        next if ($mfield =~ /^gsdl/); 
    389368         
    390              
    391         if (defined $self->{'indexfieldmap'}->{$mfield}) { 
    392             $shortname = $self->{'indexfieldmap'}->{$mfield}; 
     369        if (defined $self->{'fieldnamemap'}->{$mfield}) { 
     370            $shortname = $self->{'fieldnamemap'}->{$mfield}; 
     371        } else { 
     372            $shortname = $self->create_shortname($mfield); 
     373            $self->{'fieldnamemap'}->{$mfield} = $shortname; 
     374            $self->{'fieldnamemap'}->{$shortname} = 1; 
    393375        } 
    394         else { 
    395             $shortname = $self->create_shortname($mfield); 
    396             $self->{'indexfieldmap'}->{$mfield} = $shortname; 
    397             $self->{'indexfieldmap'}->{$shortname} = 1; 
    398         }       
     376        $self->{'allindexfields'}->{$mfield} = 1; 
    399377        $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 
    400         if (!defined $self->{'indexfields'}->{$mfield}) { 
    401             $self->{'indexfields'}->{$mfield} = 1; 
     378        if (!defined $self->{'extraindexfields'}->{$mfield}) { 
     379            $self->{'extraindexfields'}->{$mfield} = 1; 
    402380        }                    
    403381         
     
    426404     
    427405    my ($realname) = @_; 
     406    my @realnamelist = split(",", $realname); 
     407    map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces 
     408    my ($singlename) = $realnamelist[0]; 
     409 
    428410    # try our predefined static mapping 
    429     if (defined $static_indexfield_map{$realname}) { 
    430     return $static_indexfield_map{$realname}; 
    431     } 
     411    my $name; 
     412    if (defined ($name = $static_indexfield_map{$singlename})) { 
     413    if (! defined $self->{'fieldnamemap'}->{$name}) { 
     414        # has this shortname already been used?? 
     415        return $static_indexfield_map{$singlename}; 
     416    } 
     417    } 
     418    # we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters. 
     419    $realname = join ("", @realnamelist); 
    432420    #try the first two chars 
    433421    my $shortname; 
     
    443431    #if already used, take the first and third letdigs and so on 
    444432    my $count = 1; 
    445     while (defined $self->{'indexfieldmap'}->{$shortname} || defined $static_indexfield_map{$shortname}) { 
     433    while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) { 
    446434    if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) { 
    447435        $shortname = "$1$3";