Changeset 27358

Show
Ignore:
Timestamp:
16.05.2013 15:30:15 (6 years ago)
Author:
kjdon
Message:

Index sort fields separately. Tidy up the parsing of the indexes list: only work out shortnames etc. once, rather than for every document and every section. Note: the same once-only handling still needs to be done for sort fields too.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • main/trunk/greenstone2/perllib/lucenebuildproc.pm

    r27329 r27358  
    4949 
    5050    $self->{'numincdocs'} = 0; 
    51  
     51    $self->{'specified_fields'} = (); # list of fields actually specified in the index, in a map 
     52    $self->{'allfields_index'} = 0; # do we need allfields index? 
     53    $self->{'all_metadata_specified'} = 0; # are we indexing all metadata? 
     54    $self->{'actualsortfields'} = {}; # sort fields that have actually been used 
     55    $self->{'sortfieldnamemap'} = {}; # mapping between field name and field shortname, eg dc.Title->byTI 
    5256    return bless $self, $class; 
    5357} 
    5458 
     59sub set_index { 
     60    my $self = shift (@_); 
     61    my ($index, $indexexparr) = @_; 
     62 
     63    $self->mgppbuildproc::set_index($index, $indexexparr); 
     64     
     65    # just get the list of index fields without any subcoll stuff 
     66    my ($fields) = split (/:/, $self->{'index'}); 
     67 
     68    foreach my $field (split (/;/, $fields)) { 
     69    if ($field eq "allfields") { 
     70        $self->{'allfields_index'} = 1; 
     71    } elsif ($field eq "metadata") { 
     72        $self->{'all_metadata_specified'} = 1; 
     73    } else { 
     74        $field =~ s/^top//; 
     75        $self->{'specified_fields'} ->{$field} = 1; 
     76    } 
     77    }    
     78} 
     79 
     80sub set_sortfields { 
     81    my $self = shift (@_); 
     82  
     83    ($self->{'sortfields'}) = @_; 
     84} 
    5585 
    5686sub is_incremental_capable 
     
    76106    return if (!$self->get_indexing_text() && ($edit_mode eq "delete")); 
    77107 
     108    # 0/1 to indicate whether this doc is part of the specified subcollection 
    78109    my $indexed_doc = $self->is_subcollection_doc($doc_obj); 
    79110 
     
    85116    $self->{'num_docs'} -= 1; 
    86117    } 
     118 
    87119 
    88120    # get the parameters for the output 
     
    152184 
    153185 
    154     # has the user added a 'metadata' index? 
    155     my $all_metadata_specified = 0; 
    156     # which fields have already been indexed? (same as fields, but in a map) 
    157     my $specified_fields = {}; 
    158      
    159     # do we have an allfields index?? 
    160     my $allfields_index = 0; 
    161     # collect up all the text for it in here 
     186    # collect up all the text for allfields index in here (if there is one) 
    162187    my $allfields_text = ""; 
    163     foreach my $field (split (/;/, $fields)) { 
    164         if ($field eq "allfields") { 
    165         $allfields_index = 1; 
    166         } elsif ($field eq "metadata") { 
    167         $all_metadata_specified = 1; 
    168         } 
    169     } 
    170      
     188 
    171189    foreach my $field (split (/;/, $fields)) { 
    172190         
     
    180198         
    181199        #individual metadata and or text specified - could be a comma separated list 
    182         $specified_fields->{$real_field} = 1; 
     200        #$specified_fields->{$real_field} = 1; 
    183201        my $shortname=""; 
    184202        my $new_field = 0; # have we found a new field name? 
     
    235253        } 
    236254 
    237         if ($allfields_index) { 
     255        if ($self->{'allfields_index'}) { 
    238256            $allfields_text .= $new_text; 
    239257        } 
     
    259277    } # foreach field 
    260278 
    261     if ($all_metadata_specified) { 
     279    if ($self->{'all_metadata_specified'}) { 
    262280         
    263281        my $new_text = ""; 
     
    269287        next unless defined $mvalue && $mvalue ne ""; 
    270288        # we have already indexed this 
    271         next if defined ($specified_fields->{$mfield}); 
     289        next if defined ($self->{'specified_fields'}->{$mfield}); 
    272290        # check fields here, maybe others dont want - change to use dontindex!! 
    273291        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath"); 
     
    286304        $self->{'allindexfields'}->{$mfield} = 1; 
    287305        $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n"; 
    288         if ($allfields_index) { 
     306        if ($self->{'allfields_index'}) { 
    289307            $allfields_text .= "$mvalue "; 
    290308        } 
     
    308326    } 
    309327 
    310     if ($allfields_index) { 
     328    if ($self->{'allfields_index'}) { 
    311329         
    312330        my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n"; 
     
    323341        } 
    324342    } 
    325          
     343    # only add sort fields for this section if we are indexing this section, and either we are doing section level indexing or this is the top section 
     344    if ($self->{'indexing_text'} && ($sec_tag_name ne "" || $doc_section == 1 )) { 
     345    # add sort fields if there are any 
     346         
     347    foreach my $sfield (@{$self->{'sortfields'}}) { 
     348        my $sf_shortname; 
     349        if (defined $self->{'sortfieldnamemap'}->{$sfield}) { 
     350        $sf_shortname = $self->{'sortfieldnamemap'}->{$sfield}; 
     351        } 
     352        else { 
     353        $sf_shortname = $self->create_sortfield_shortname($sfield); 
     354        $self->{'sortfieldnamemap'}->{$sfield} = $sf_shortname; 
     355        $self->{'sortfieldnamemap'}->{$sf_shortname} = 1; 
     356        } 
     357        my @metadata_list = (); # put any metadata values in here 
     358        foreach my $submeta (split /,/, $sfield) { 
     359        $submeta =~ s/^ex\.([^.]+)$/$1/; #strip off ex. iff it's the only metadata set prefix (will leave ex.dc.* intact) 
     360         
     361        my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 
     362        push (@metadata_list, @section_metadata); 
     363        } 
     364        my $new_text = ""; 
     365        foreach my $item (@metadata_list) { 
     366        &ghtml::htmlsafe($item); 
     367        $new_text .= "$item "; 
     368        } 
     369        if ($new_text =~ /\S/) { 
     370        $new_text = "<$sf_shortname index=\"1\" tokenize=\"0\">$new_text</$sf_shortname>"; 
     371        # filter the text??? 
     372        $text .= "$new_text"; # add it to the main text block 
     373        $self->{'actualsortfields'}->{$sfield} = 1; 
     374        } 
     375    } 
     376    } 
    326377    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne ""); 
    327378 
    328379        $section = $doc_obj->get_next_section($section); 
    329     } # while defined section 
     380    } # for each section 
     381     
     382    #open (TEXTOUT, ">text.out"); 
     383    #print TEXTOUT "$text\n$documentendtag"; 
     384    #close TEXTOUT; 
    330385 
    331386    print $lucenehandle "$text\n$documentendtag"; 
     
    556611    } 
    557612} 
     613 
     614sub create_sortfield_shortname { 
     615    my $self = shift(@_); 
     616 
     617    my ($realname) = @_; 
     618 
     619    my $index_shortname; 
     620    # if we have created a shortname for an index on this field, then use it. 
     621    if (defined $self->{'fieldnamemap'}->{$realname}) { 
     622    $index_shortname = $self->{'fieldnamemap'}->{$realname}; 
     623    } else { 
     624    $index_shortname = $self->create_shortname($realname); 
     625    } 
     626    return "by".$index_shortname; 
     627} 
     628   
     629 
    5586301; 
    559631