Ignore:
Timestamp:
2000-07-13T10:21:53+12:00 (24 years ago)
Author:
sjboddie
Message:

merged changes to trunk into New_Config_Format branch

File:
1 edited

Legend:

Unmodified
Added
Removed
  • branches/New_Config_Format-branch/gsdl/perllib/mgbuildproc.pm

    r1072 r1279  
    5858    $self->{'num_sections'} = 0;
    5959    $self->{'num_bytes'} = 0;
     60    $self->{'num_processed_bytes'} = 0;
    6061
    6162    $self->{'indexing_text'} = 0;
     
    6970    $self->{'num_docs'} = 0;
    7071    $self->{'num_sections'} = 0;
     72    $self->{'num_processed_bytes'} = 0;
    7173    $self->{'num_bytes'} = 0;
    7274}
     
    8486}
    8587
     88# num_bytes is the actual number of bytes in the collection;
     89# this is normally the same as the number of bytes processed during text compression
    # Accessor: returns the value currently stored in this object's
    # 'num_bytes' field. Takes no arguments besides the invocant.
    8690sub get_num_bytes {
    8791    my $self = shift (@_);
    8892
    8993    return $self->{'num_bytes'};
     94}
     95
     96# num_processed_bytes is the number of bytes actually passed
     97# to mg for the current index
     # Accessor: returns the value currently stored in this object's
     # 'num_processed_bytes' field. Takes no arguments besides the invocant.
     98sub get_num_processed_bytes {
     99    my $self = shift (@_);
     100
     101    return $self->{'num_processed_bytes'};
    90102}
    91103
     
    126138}
    127139
     # Accessor: returns whatever is stored under this object's 'index' key.
     # Takes no arguments besides the invocant.
     140sub get_index {
     141    my $self = shift (@_);
     142
     143    return $self->{'index'};
     144}
     145
    128146sub set_classifiers {
    129147    my $self = shift (@_);
     
    138156
    139157    $self->{'indexing_text'} = $indexing_text;
     158}
     159
     # Accessor: returns the value stored under this object's 'indexing_text'
     # key. Takes no arguments besides the invocant.
     160sub get_indexing_text {
     161    my $self = shift (@_);
     162
     163    return $self->{'indexing_text'};
    140164}
    141165
     
    416440            if ($real_field eq "text") {
    417441            $new_text = $doc_obj->get_text ($section);
     442            $self->{'num_processed_bytes'} += length ($new_text);
    418443            $new_text =~ s/[\cB\cC]//g;
    419444            $self->find_paragraphs($new_text);
     
    423448            foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
    424449                $meta =~ s/[\cB\cC]//g;
     450                $self->{'num_processed_bytes'} += length ($meta);
    425451                $new_text .= "\cC" unless $first;
    426452                $new_text .= $meta;
Note: See TracChangeset for help on using the changeset viewer.