Changeset 27358 for main


Ignore:
Timestamp:
2013-05-16T15:30:15+12:00 (11 years ago)
Author:
kjdon
Message:

Indexing sortfields separately. Tidied up the parsing of the indexes list: shortnames etc. are now worked out only once, not for every document and every section. Note: the same still needs to be done for sort fields.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/lucenebuildproc.pm

    r27329 r27358  
    4949
    5050    $self->{'numincdocs'} = 0;
    51 
     51    $self->{'specified_fields'} = (); # list of fields actually specified in the index, in a map
     52    $self->{'allfields_index'} = 0; # do we need allfields index?
     53    $self->{'all_metadata_specified'} = 0; # are we indexing all metadata?
     54    $self->{'actualsortfields'} = {}; # sort fields that have actually been used
     55    $self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI
    5256    return bless $self, $class;
    5357}
    5458
     59sub set_index {
           # Hand the index spec to mgppbuildproc::set_index, then scan its field
           # list once and record on $self: whether an "allfields" index was
           # requested, whether "metadata" was requested, and every other field
           # named (stored as keys of specified_fields).
           # NOTE(review): nothing in this sub clears these flags or the map, so
           # values accumulate if set_index is called again on the same object -
           # confirm that is intended.
     60    my $self = shift (@_);
     61    my ($index, $indexexparr) = @_;
     62
     63    $self->mgppbuildproc::set_index($index, $indexexparr);
     64   
           # presumably the call above is what stores the spec in $self->{'index'} - TODO confirm
           # only the text before the first ':' is kept; fields in it are separated by ';'
     65    # just get the list of index fields without any subcoll stuff
     66    my ($fields) = split (/:/, $self->{'index'});
     67
     68    foreach my $field (split (/;/, $fields)) {
     69    if ($field eq "allfields") {
     70        $self->{'allfields_index'} = 1;
     71    } elsif ($field eq "metadata") {
     72        $self->{'all_metadata_specified'} = 1;
     73    } else {
               # strip a leading "top" before using the name as the map key
     74        $field =~ s/^top//;
     75        $self->{'specified_fields'} ->{$field} = 1;
     76    }
     77    }   
     78}
     79
     80sub set_sortfields {
           # Store the first argument in $self->{'sortfields'}; any further
           # arguments are ignored by the one-element list assignment below.
           # The value is later dereferenced as an array (@{$self->{'sortfields'}}),
           # so callers are expected to pass an array reference.
     81    my $self = shift (@_);
     82 
     83    ($self->{'sortfields'}) = @_;
     84}
    5585
    5686sub is_incremental_capable
     
    76106    return if (!$self->get_indexing_text() && ($edit_mode eq "delete"));
    77107
     108    # 0/1 to indicate whether this doc is part of the specified subcollection
    78109    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
    79110
     
    85116    $self->{'num_docs'} -= 1;
    86117    }
     118
    87119
    88120    # get the parameters for the output
     
    152184
    153185
    154     # has the user added a 'metadata' index?
    155     my $all_metadata_specified = 0;
    156     # which fields have already been indexed? (same as fields, but in a map)
    157     my $specified_fields = {};
    158    
    159     # do we have an allfields index??
    160     my $allfields_index = 0;
    161     # collect up all the text for it in here
     186    # collect up all the text for allfields index in here (if there is one)
    162187    my $allfields_text = "";
    163     foreach my $field (split (/;/, $fields)) {
    164         if ($field eq "allfields") {
    165         $allfields_index = 1;
    166         } elsif ($field eq "metadata") {
    167         $all_metadata_specified = 1;
    168         }
    169     }
    170    
     188
    171189    foreach my $field (split (/;/, $fields)) {
    172190       
     
    180198       
    181199        #individual metadata and or text specified - could be a comma separated list
    182         $specified_fields->{$real_field} = 1;
     200        #$specified_fields->{$real_field} = 1;
    183201        my $shortname="";
    184202        my $new_field = 0; # have we found a new field name?
     
    235253        }
    236254
    237         if ($allfields_index) {
     255        if ($self->{'allfields_index'}) {
    238256            $allfields_text .= $new_text;
    239257        }
     
    259277    } # foreach field
    260278
    261     if ($all_metadata_specified) {
     279    if ($self->{'all_metadata_specified'}) {
    262280       
    263281        my $new_text = "";
     
    269287        next unless defined $mvalue && $mvalue ne "";
    270288        # we have already indexed this
    271         next if defined ($specified_fields->{$mfield});
     289        next if defined ($self->{'specified_fields'}->{$mfield});
    272290        # check fields here, maybe others dont want - change to use dontindex!!
    273291        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
     
    286304        $self->{'allindexfields'}->{$mfield} = 1;
    287305        $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
    288         if ($allfields_index) {
     306        if ($self->{'allfields_index'}) {
    289307            $allfields_text .= "$mvalue ";
    290308        }
     
    308326    }
    309327
    310     if ($allfields_index) {
     328    if ($self->{'allfields_index'}) {
    311329       
    312330        my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
     
    323341        }
    324342    }
    325        
     343    # only add sort fields for this section if we are indexing this section, we are doing section level indexing or this is the top section
     344    if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) {
     345    # add sort fields if there are any
     346       
     347    foreach my $sfield (@{$self->{'sortfields'}}) {
     348        my $sf_shortname;
     349        if (defined $self->{'sortfieldnamemap'}->{$sfield}) {
     350        $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield};
     351        }
     352        else {
     353        $sf_shortname = $self->create_sortfield_shortname($sfield);
     354        $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname;
     355        $self->{'sortfieldnamemap'}->{$sf_shortname} = 1;
     356        }
     357        my @metadata_list = (); # put any metadata values in here
     358        foreach my $submeta (split /,/, $sfield) {
     359        $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact)
     360       
     361        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
     362        push (@metadata_list, @section_metadata);
     363        }
     364        my $new_text = "";
     365        foreach my $item (@metadata_list) {
     366        &ghtml::htmlsafe($item);
     367        $new_text .= "$item ";
     368        }
     369        if ($new_text =~ /\S/) {
     370        $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>";
     371        # filter the text???
     372        $text .= "$new_text"; # add it to the main text block
     373        $self->{'actualsortfields'}->{$sfield} = 1;
     374        }
     375    }
     376    }
    326377    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
    327378
    328379        $section = $doc_obj->get_next_section($section);
    329     } # while defined section
     380    } # for each section
     381   
     382    #open (TEXTOUT, ">text.out");
     383    #print TEXTOUT "$text\n$documentendtag";
     384    #close TEXTOUT;
    330385
    331386    print $lucenehandle "$text\n$documentendtag";
     
    556611    }
    557612}
     613
     614sub create_sortfield_shortname {
            # Return the sort-field shortname for $realname: the string "by"
            # followed by the index shortname for that field.
            # The index shortname is taken from $self->{'fieldnamemap'} when an
            # entry for $realname is defined; otherwise it is obtained by calling
            # $self->create_shortname (not defined in this view).
     615    my $self = shift(@_);
     616
     617    my ($realname) = @_;
     618
     619    my $index_shortname;
     620    # if we have created a shortname for an index on this field, then use it.
     621    if (defined $self->{'fieldnamemap'}->{$realname}) {
     622    $index_shortname = $self->{'fieldnamemap'}->{$realname};
     623    } else {
     624    $index_shortname = $self->create_shortname($realname);
     625    }
     626    return "by".$index_shortname;
     627}
     628 
     629
    5586301;
    559631
Note: See TracChangeset for help on using the changeset viewer.