Changeset 27328


Ignore:
Timestamp:
2013-05-14T11:09:58+12:00 (11 years ago)
Author:
kjdon
Message:

Changed the way we store the list of fields that have been indexed, and the mapping between index and shortname. They are now stored separately, to avoid calculating a shortname for a field each time a new document is indexed. Previously, if a field had no value, its shortname was not remembered because the field wasn't indexed, so each new document caused the shortname to be calculated again. Also remove namespaces from meta fields before calculating shortnames, to make them more sensible, e.g. dc.Title->TI instead of DC.

Location:
main/trunk/greenstone2/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/mgppbuilder.pm

    r27306 r27328  
    489489    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    490490    $self->{'buildproc'}->set_indexing_text (1);
    491     #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    492491    $self->{'buildproc'}->set_levels ($self->{'levels'});
    493492    $self->{'buildproc'}->set_db_level ($db_level);   
     
    637636    # these now come from collection meta. if that is not defined, uses the metadata name
    638637    my $collmeta = "";
    639     if (defined $self->{'build_cfg'}->{'indexfields'}) {
    640     foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
    641         my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
     638    if (defined $self->{'build_cfg'}->{'extraindexfields'}) {
     639    foreach my $longfield (@{$self->{'build_cfg'}->{'extraindexfields'}}){
     640        my $shortfield = $self->{'buildproc'}->{'fieldnamemap'}->{$longfield};
    642641        next if $shortfield eq 1;
    643642       
     
    743742   
    744743    #add all fields bit
    745     my $ifm = $self->{'buildproc'}->{'indexfieldmap'};
     744    my $fnm = $self->{'buildproc'}->{'fieldnamemap'};
    746745   
    747746    foreach $field (@specifiedfieldorder) {
    748747    if ($field eq "metadata") {
    749         foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
     748        foreach my $newfield (keys %{$self->{'buildproc'}->{'extraindexfields'}}) {
    750749        if (!defined $specifiedfields->{$newfield}) {
    751             push (@indexfieldmap, "$newfield\-\>$self->{'buildproc'}->{'indexfieldmap'}->{$newfield}");
     750            push (@indexfieldmap, "$newfield\-\>$fnm->{$newfield}");
    752751            push (@indexfields, "$newfield");
    753752        }
     
    762761    } else {
    763762        # we only add in the ones that have been processed
    764         if (defined $ifm->{$field}) {
    765         push (@indexfieldmap, "$field\-\>$ifm->{$field}");
     763        if (defined $self->{'buildproc'}->{'allindexfields'}->{$field}) {
     764        push (@indexfieldmap, "$field\-\>$fnm->{$field}");
    766765        push (@indexfields, "$field");
    767766        }
  • main/trunk/greenstone2/perllib/mgppbuildproc.pm

    r24404 r27328  
    9090
    9191    $self->{'dontindex'} = {};
    92     $self->{'indexfieldmap'} = {};
    93     $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
     92    $self->{'allindexfields'} = {}; # list of all actually indexed fields
     93    $self->{'extraindexfields'} = {}; # indexed fields not specfied in original index list - ie if 'metadata' was specified.
     94    $self->{'fieldnamemap'} = {'allfields'=>'ZZ',
     95              'ZZ'=>1,
     96              'text'=>'TX',
     97              'TX'=>1}; # mapping between index full names and short names. Once we have decided on a mapping it goes in here, whether we have indexed something or not.
    9498    $self->{'strip_html'}=1;
    9599   
    96100    return bless $self, $class;
    97 }
    98 
    99 
    100 #sub set_indexfieldmap {
    101 #    my $self = shift (@_);
    102 #    my ($indexmap) = @_;
    103 
    104 #    $self->{'default_index_field_mapping'} = $indexmap;
    105     #$self->{'indexfieldmap'} = $indexmap;
    106 #}
    107 
    108 sub get_indexfieldmap {
    109     my $self = shift (@_);
    110 
    111     return $self->{'indexfieldmap'};
    112101}
    113102
     
    279268        my $new_text = "";
    280269
    281         # we get allfields by default - do nothing except add into the map
    282         if ($real_field eq "allfields") {
    283         $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
    284         $self->{'indexfieldmap'}->{"ZZ"} = 1;
    285         }
     270        # we get allfields by default
     271        next if ($real_field eq "allfields");
    286272       
    287273        # metadata - output all metadata we know about except gsdl stuff
     
    290276        # we will process this later, so we are not reindexing metadata already indexed
    291277        $all_metadata_specified = 1;
     278        next;
    292279        }
    293280       
    294         else {
    295        
    296281        #individual metadata and or text specified - could be
    297282        # a comma separated list
    298283        $specified_fields->{$real_field} = 1;
    299284        my $shortname="";
    300         my $new_field = 0; # have we found a new field name?
    301 
    302         if (defined $self->{'indexfieldmap'}->{$real_field}) {
    303             $shortname = $self->{'indexfieldmap'}->{$real_field};
    304         }
    305         else {
    306             $shortname = $self->create_shortname($real_field);
    307             $new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values
    308         }
     285
     286        if (defined $self->{'fieldnamemap'}->{$real_field}) {
     287        $shortname = $self->{'fieldnamemap'}->{$real_field};
     288        } else {
     289        $shortname = $self->create_shortname($real_field);
     290        $self->{'fieldnamemap'}->{$real_field} = $shortname;
     291        $self->{'fieldnamemap'}->{$shortname} = 1;
     292        }
     293
    309294        my @metadata_list = (); # put any meta values in here
    310295        my $section_text = ""; # put any text in here
     
    357342            # only add tags in if indexing
    358343            $new_text .= "</$shortname>";
    359             }
    360             if ($self->{'indexing_text'} && $new_field) {
    361             # we need to add to the list in indexfields
    362 
    363             $self->{'indexfieldmap'}->{$real_field} = $shortname;
    364             $self->{'indexfieldmap'}->{$shortname} = 1;
     344            $self->{'allindexfields'}->{$real_field} = 1;
    365345            }
    366346        }
    367         }
    368347
    369348        # filter the text
     
    388367        next if ($mfield =~ /^gsdl/);
    389368       
    390            
    391         if (defined $self->{'indexfieldmap'}->{$mfield}) {
    392             $shortname = $self->{'indexfieldmap'}->{$mfield};
     369        if (defined $self->{'fieldnamemap'}->{$mfield}) {
     370            $shortname = $self->{'fieldnamemap'}->{$mfield};
     371        } else {
     372            $shortname = $self->create_shortname($mfield);
     373            $self->{'fieldnamemap'}->{$mfield} = $shortname;
     374            $self->{'fieldnamemap'}->{$shortname} = 1;
    393375        }
    394         else {
    395             $shortname = $self->create_shortname($mfield);
    396             $self->{'indexfieldmap'}->{$mfield} = $shortname;
    397             $self->{'indexfieldmap'}->{$shortname} = 1;
    398         }     
     376        $self->{'allindexfields'}->{$mfield} = 1;
    399377        $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
    400         if (!defined $self->{'indexfields'}->{$mfield}) {
    401             $self->{'indexfields'}->{$mfield} = 1;
     378        if (!defined $self->{'extraindexfields'}->{$mfield}) {
     379            $self->{'extraindexfields'}->{$mfield} = 1;
    402380        }                   
    403381       
     
    426404   
    427405    my ($realname) = @_;
     406    my @realnamelist = split(",", $realname);
     407    map {$_=~ s/^[a-zA-Z]+\.//;} @realnamelist; #remove namespaces
     408    my ($singlename) = $realnamelist[0];
     409
    428410    # try our predefined static mapping
    429     if (defined $static_indexfield_map{$realname}) {
    430     return $static_indexfield_map{$realname};
    431     }
     411    my $name;
     412    if (defined ($name = $static_indexfield_map{$singlename})) {
     413    if (! defined $self->{'fieldnamemap'}->{$name}) {
     414        # has this shortname already been used??
     415        return $static_indexfield_map{$singlename};
     416    }
     417    }
     418    # we can't use the quick map, so join all fields back together (without namespaces), and try sets of two characters.
     419    $realname = join ("", @realnamelist);
    432420    #try the first two chars
    433421    my $shortname;
     
    443431    #if already used, take the first and third letdigs and so on
    444432    my $count = 1;
    445     while (defined $self->{'indexfieldmap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
     433    while (defined $self->{'fieldnamemap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
    446434    if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
    447435        $shortname = "$1$3";
Note: See TracChangeset for help on using the changeset viewer.