Ignore:
Timestamp:
2000-06-29T10:34:25+12:00 (24 years ago)
Author:
sjboddie
Message:

Added some statistics reporting and a warning message to the build code.
The build now warns when very little or no text is to be processed for a
given index (mg fails in these situations). This should help in noticing
when an attempt is made to create an index on metadata that is never set,
etc.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgbuildproc.pm

    r1072 r1251  
    5858    $self->{'num_sections'} = 0;
    5959    $self->{'num_bytes'} = 0;
     60    $self->{'num_processed_bytes'} = 0;
    6061
    6162    $self->{'indexing_text'} = 0;
     
    6970    $self->{'num_docs'} = 0;
    7071    $self->{'num_sections'} = 0;
     72    $self->{'num_processed_bytes'} = 0;
    7173    $self->{'num_bytes'} = 0;
    7274}
     
    8486}
    8587
     88# num_bytes is the actual number of bytes in the collection
     89# this is normally the same as what's processed during text compression
    8690sub get_num_bytes {
    8791    my $self = shift (@_);
    8892
    8993    return $self->{'num_bytes'};
     94}
     95
     96# num_processed_bytes is the number of bytes actually passed
     97# to mg for the current index
     98sub get_num_processed_bytes {
     99    my $self = shift (@_);
     100
     101    return $self->{'num_processed_bytes'};
    90102}
    91103
     
    126138}
    127139
     140sub get_index {
     141    my $self = shift (@_);
     142
     143    return $self->{'index'};
     144}
     145
    128146sub set_classifiers {
    129147    my $self = shift (@_);
     
    138156
    139157    $self->{'indexing_text'} = $indexing_text;
     158}
     159
     160sub get_indexing_text {
     161    my $self = shift (@_);
     162
     163    return $self->{'indexing_text'};
    140164}
    141165
     
    416440            if ($real_field eq "text") {
    417441            $new_text = $doc_obj->get_text ($section);
     442            $self->{'num_processed_bytes'} += length ($new_text);
    418443            $new_text =~ s/[\cB\cC]//g;
    419444            $self->find_paragraphs($new_text);
     
    423448            foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
    424449                $meta =~ s/[\cB\cC]//g;
     450                $self->{'num_processed_bytes'} += length ($meta);
    425451                $new_text .= "\cC" unless $first;
    426452                $new_text .= $meta;
Note: See TracChangeset for help on using the changeset viewer.