Changeset 1251


Ignore:
Timestamp:
2000-06-29T10:34:25+12:00 (24 years ago)
Author:
sjboddie
Message:

Added some stat reporting and a warning message to the build code.
The build now warns when very little or no text (fewer than 50 bytes) is
to be processed for a given index, because mg fails in these situations.
This should help in noticing an attempt to create an index on metadata
that is never set, etc.

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgbuilder.pm

    r1246 r1251  
    207207    close ($handle) unless $self->{'debug'};
    208208
     209    $self->print_stats();
     210
    209211    # create the compression dictionary
    210212    # the compression dictionary is built by assuming the stats are from a seed
     
    232234           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    233235    close ($handle) unless $self->{'debug'};
     236
     237    $self->print_stats();
    234238}
    235239
     
    481485    close ($handle) unless $self->{'debug'};
    482486
     487    $self->print_stats();
     488
    483489    if (!$self->{'debug'}) {
    484490    # create the perfect hash function
     
    502508           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    503509   
     510    $self->print_stats ();
     511
    504512    if (!$self->{'debug'}) {
    505513
     
    680688}
    681689
     690sub print_stats {
     691    my $self = shift (@_);
     692
     693    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
     694    my $index = $self->{'buildproc'}->get_index();
     695    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
     696    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();
     697
     698    if ($indexing_text) {
     699    print STDERR "Stats (Creating index $index)\n";
     700    } else {
     701    print STDERR "Stats (Compressing text from $index)\n";
     702    }
     703    print STDERR "Total bytes in collection: $num_bytes\n";
     704    print STDERR "Total bytes in $index: $num_processed_bytes\n";
     705
     706    if ($num_processed_bytes < 50) {
     707    print STDERR "***************\n";
     708    print STDERR "WARNING: There is very little or no text to process for $index\n";
     709    if ($indexing_text) {
     710        print STDERR "This may cause an error while attempting to build the index\n";
     711    } else {
     712        print STDERR "This may cause an error while attempting to compress the text\n";
     713    }
     714    print STDERR "***************\n";
     715    }
     716}
    682717
    6837181;
  • trunk/gsdl/perllib/mgbuildproc.pm

    r1072 r1251  
    5858    $self->{'num_sections'} = 0;
    5959    $self->{'num_bytes'} = 0;
     60    $self->{'num_processed_bytes'} = 0;
    6061
    6162    $self->{'indexing_text'} = 0;
     
    6970    $self->{'num_docs'} = 0;
    7071    $self->{'num_sections'} = 0;
     72    $self->{'num_processed_bytes'} = 0;
    7173    $self->{'num_bytes'} = 0;
    7274}
     
    8486}
    8587
     88# num_bytes is the actual number of bytes in the collection
     89# this is normally the same as what's processed during text compression
    8690sub get_num_bytes {
    8791    my $self = shift (@_);
    8892
    8993    return $self->{'num_bytes'};
     94}
     95
     96# num_processed_bytes is the number of bytes actually passed
     97# to mg for the current index
     98sub get_num_processed_bytes {
     99    my $self = shift (@_);
     100
     101    return $self->{'num_processed_bytes'};
    90102}
    91103
     
    126138}
    127139
     140sub get_index {
     141    my $self = shift (@_);
     142
     143    return $self->{'index'};
     144}
     145
    128146sub set_classifiers {
    129147    my $self = shift (@_);
     
    138156
    139157    $self->{'indexing_text'} = $indexing_text;
     158}
     159
     160sub get_indexing_text {
     161    my $self = shift (@_);
     162
     163    return $self->{'indexing_text'};
    140164}
    141165
     
    416440            if ($real_field eq "text") {
    417441            $new_text = $doc_obj->get_text ($section);
     442            $self->{'num_processed_bytes'} += length ($new_text);
    418443            $new_text =~ s/[\cB\cC]//g;
    419444            $self->find_paragraphs($new_text);
     
    423448            foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
    424449                $meta =~ s/[\cB\cC]//g;
     450                $self->{'num_processed_bytes'} += length ($meta);
    425451                $new_text .= "\cC" unless $first;
    426452                $new_text .= $meta;
Note: See TracChangeset for help on using the changeset viewer.