Changeset 1694


Ignore:
Timestamp:
2000-11-23T11:55:50+13:00 (23 years ago)
Author:
kjm18
Message:

updated to resemble the corresponding updated mg versions

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuilder.pm

    r1301 r1694  
    3131use plugin;
    3232use util;
     33use FileHandle;
     34
     35
     36BEGIN {
     37    # set autoflush on for STDERR and STDOUT so that mg
     38    # doesn't get out of sync with plugins
     39    STDOUT->autoflush(1);
     40    STDERR->autoflush(1);
     41}
     42
     43END {
     44    STDOUT->autoflush(0);
     45    STDERR->autoflush(0);
     46}
     47
     48$maxdocsize = 12000;
    3349
    3450#update this !!!!!!!!!!!!!!!!
     
    4763sub new {
    4864    my ($class, $collection, $source_dir, $build_dir, $verbosity,
    49     $maxdocs, $debug, $keepold, $allclassifications) = @_;
     65    $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;
     66
     67    $outhandle = STDERR unless defined $outhandle;
    5068
    5169    # create an mgppbuilder object
     
    5876              'keepold'=>$keepold,
    5977              'allclassifications'=>$allclassifications,
     78              'outhandle'=>$outhandle,
    6079              'notbuilt'=>[]    # indexes not built
    6180              }, $class;
     
    100119    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    101120    if (scalar(@{$self->{'pluginfo'}}) == 0) {
    102     print STDERR "No plugins were loaded.\n";
     121    print $outhandle "No plugins were loaded.\n";
    103122    die "\n";
    104123    }
     
    111130   
    112131    # load all the classifiers
    113     $self->{'classifiers'} = &classify::load_classifiers ($classifiers);
     132    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);
    114133
    115134    # load up any dontgdbm fields
     
    135154
    136155    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
    137      "\$source_dir, \$build_dir, \$verbosity)");
     156     "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    138157    die "$@" if $@;
    139158
     
    160179    my ($textindex, $indexname) = @_;
    161180
    162     print STDERR "build_col, textindex=$textindex, indexname=$indexname\n";
     181    my $outhandle = $self->{'outhandle'};
     182
     183    print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n";
    163184    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    164185    my $exe = &util::get_os_exe ();
     
    203224    push @$indexes, $indexname;
    204225    }
    205     print STDERR "indexes are: @$indexes\n";
    206    
    207 
    208    print STDERR "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
    209     print STDERR "fulltextprefix=$fulltextprefix\n";
     226    print $outhandle "indexes are: @$indexes\n";
     227   
     228
     229    print $outhandle "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
     230    print $outhandle "fulltextprefix=$fulltextprefix\n";
    210231    # carry out the first pass of mg_passes
    211232    # -b $maxdocsize sets the maximum document size to be 12 meg - not available any longer
    212     print STDERR "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
     233    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
    213234
    214235    my ($handle);
     
    247268    # words being put into the dictionary first (-2 -k 5120)
    248269    if (!$self->{'debug'}) {
    249     print STDERR "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
     270    print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
    250271    if (!-e "$mg_compression_dict_exe") {
    251272        die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
     
    271292    $self->{'buildproc'}->reset();
    272293   
    273     print STDERR "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
     294    print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    274295    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    275296           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     
    279300   
    280301    # create the weights file
    281     print STDERR "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
     302    print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
    282303    if (!-e "$mg_weights_build_exe") {
    283304    die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
     
    286307   
    287308    # create 'on-disk' stemmed dictionary
    288     print STDERR "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
     309    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
    289310    if (!-e "$mg_invf_dict_exe") {
    290311    die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
     
    294315   
    295316    # creates stem index files for the various stemming methods
    296     print STDERR "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
     317    print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
    297318    if (!-e "$mg_stem_idx_exe") {
    298319    die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
     
    315336    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    316337    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
     338    my $outhandle = $self->{'outhandle'};
    317339
    318340    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
     
    327349    }
    328350
    329     print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
     351    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    330352
    331353    # collect the statistics for the text
    332        print STDERR "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
     354    # -b $maxdocsize sets the maximum document size to be 12 meg
     355    print $outhandle "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
    333356
    334357    my ($handle);
     
    364387    # note: these options are left over from the mg version
    365388    if (!$self->{'debug'}) {
    366     print STDERR "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
     389    print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
    367390    if (!-e "$mg_compression_dict_exe") {
    368391        die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
     
    381404    $self->{'buildproc'}->reset();
    382405    # compress the text
    383     print STDERR "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
     406    print $outhandle "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    384407    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    385408           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    386409    close ($handle) unless $self->{'debug'};
     410
     411    $self->print_stats();
    387412}
    388413
     
    406431    my $self = shift (@_);
    407432    my ($indexname) = @_;
     433    my $outhandle = $self->{'outhandle'};
    408434
    409435    my $indexes = [];
     
    416442#    push @$indexes, "text,Title,Organization,Magazine,Subject";
    417443#    push @$indexes, "Title,Organization,Magazine,Subject";
     444
    418445    # create the mapping between the index descriptions
    419446    # and their directory names
     
    423450    foreach $index (@$indexes) {
    424451    if ($self->want_built($index)) {
    425         print STDERR "\n*** building index $index in subdirectory " .
     452        print $outhandle "\n*** building index $index in subdirectory " .
    426453        "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
    427454        $self->build_index($index);
    428455    } else {
    429         print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
     456        print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
    430457    }
    431458    }
     
    551578    my $self = shift (@_);
    552579    my ($index) = @_;
     580    my $outhandle = $self->{'outhandle'};
    553581
    554582    # get the full index directory path and make sure it exists
     
    603631
    604632    # Build index dictionary. Uses verbatim stem method
    605     print STDERR "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
     633    print $outhandle "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
    606634    my ($handle);
    607635    if ($self->{'debug'}) {
     
    626654    close ($handle) unless $self->{'debug'};
    627655
     656    $self->print_stats();
     657
    628658    if (!$self->{'debug'}) {
    629659    # create the perfect hash function
     
    640670   
    641671    # invert the text
    642     print STDERR "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
     672    print $outhandle "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
    643673
    644674    $self->{'buildproc'}->reset();
    645675    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    646676           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     677
     678    $self->print_stats ();
    647679   
    648680    if (!$self->{'debug'}) {
     
    651683   
    652684    # create the weights file
    653     print STDERR "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
     685    print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
    654686    if (!-e "$mg_weights_build_exe") {
    655687        die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
     
    658690
    659691    # create 'on-disk' stemmed dictionary
    660     print STDERR "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
     692    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
    661693    if (!-e "$mg_invf_dict_exe") {
    662694        die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
     
    666698
    667699    # creates stem index files for the various stemming methods
    668     print STDERR "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
     700    print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
    669701    if (!-e "$mg_stem_idx_exe") {
    670702        die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
     
    684716#       if (defined $suffix && !defined $wanted_index_files{$suffix}) {
    685717        # delete it!
    686 #       print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
     718#       print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
    687719#       &util::rm (&util::filename_cat ($tmpdir, $file));
    688720#       }
     
    694726sub make_infodatabase {
    695727    my $self = shift (@_);
     728    my $outhandle = $self->{'outhandle'};
     729
     730
    696731    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    697732    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
     
    709744    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
    710745
    711     print STDERR "\n*** creating the info database and processing associated files\n"
     746    print $outhandle "\n*** creating the info database and processing associated files\n"
    712747    if ($self->{'verbosity'} >= 1);
    713748
     
    748783            print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
    749784            $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
    750         print STDERR "have .section entry in collect file\n";
     785        print $outhandle "have .section entry in collect file\n";
    751786        } else {
    752             print STDERR "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
     787            print $outhandle "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
    753788        }
    754789        } else {
     
    779814    my %build_cfg = ();
    780815
    781     print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
     816    my $outhandle =  $self->{'outhandle'};
     817    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    782818
    783819    # get the text directory
     
    826862}
    827863
     864sub print_stats {
     865    my $self = shift (@_);
     866
     867    my $outhandle = $self->{'outhandle'};
     868    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
     869    my $index = $self->{'buildproc'}->get_index();
     870    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
     871    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();
     872
     873    if ($indexing_text) {
     874    print $outhandle "Stats (Creating index $index)\n";
     875    } else {
     876    print $outhandle "Stats (Compressing text from $index)\n";
     877    }
     878    print $outhandle "Total bytes in collection: $num_bytes\n";
     879    print $outhandle "Total bytes in $index: $num_processed_bytes\n";
     880
     881    if ($num_processed_bytes < 50) {
     882    print $outhandle "***************\n";
     883    print $outhandle "WARNING: There is very little or no text to process for $index\n";
     884    if ($indexing_text) {
     885        print $outhandle "This may cause an error while attempting to build the index\n";
     886    } else {
     887        print $outhandle "This may cause an error while attempting to compress the text\n";
     888    }
     889    print $outhandle "***************\n";
     890    }
     891
     892}
    828893
    8298941;
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r932 r1694  
    4242
    4343sub new {
    44     my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
     44    my ($class, $collection, $source_dir, $build_dir,
     45    $verbosity, $outhandle) = @_;
    4546    my $self = new docproc ();
     47
     48    # outhandle is where all the debugging info goes
     49    # output_handle is where the output of the plugins is piped
     50    # to (i.e. mg, gdbm etc.)
     51    $outhandle = STDERR unless defined $outhandle;
    4652
    4753    $self->{'collection'} = $collection;
     
    5965    $self->{'num_sections'} = 0;
    6066    $self->{'num_bytes'} = 0;
     67    $self->{'num_processed_bytes'} = 0;
     68    $self->{'outhandle'} = $outhandle;
    6169
    6270    $self->{'indexing_text'} = 0;
     
    7078    $self->{'num_docs'} = 0;
    7179    $self->{'num_sections'} = 0;
     80    $self->{'num_processed_bytes'} = 0;
    7281    $self->{'num_bytes'} = 0;
    7382}
     
    8594}
    8695
     96# num_bytes is the actual number of bytes in the collection
     97# this is normally the same as what's processed during text compression
    8798sub get_num_bytes {
    8899    my $self = shift (@_);
    89100
    90101    return $self->{'num_bytes'};
     102}
     103
     104# num_processed_bytes is the number of bytes actually passed
     105# to mgpp for the current index
     106sub get_num_processed_bytes {
     107    my $self = shift (@_);
     108
     109    return $self->{'num_processed_bytes'};
    91110}
    92111
     
    127146}
    128147
     148sub get_index {
     149    my $self = shift (@_);
     150
     151    return $self->{'index'};
     152}
     153
    129154sub set_classifiers {
    130155    my $self = shift (@_);
     
    139164
    140165    $self->{'indexing_text'} = $indexing_text;
     166}
     167
     168sub get_indexing_text {
     169    my $self = shift (@_);
     170
     171    return $self->{'indexing_text'};
    141172}
    142173
     
    240271    else { print $handle "[$doc_OID.$section]\n"; }
    241272
    242     # output the fact that this document is a document
    243     #print $handle "<doctype>doc\n";
     273    # output the fact that this document is a document (unless doctype
     274    # has been set to something else from within a plugin)
     275    my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
     276    if (!defined $dtype || $dtype !~ /\w/) {
     277        print $handle "<doctype>doc\n";
     278    }
    244279
    245280    # output whether this node contains text
     
    251286
    252287    # output all the section metadata
    253     my $found_doctype = 0;
     288    #my $found_doctype = 0;
    254289    my $metadata = $doc_obj->get_all_metadata ($section);
    255290    foreach $pair (@$metadata) {
    256291        my ($field, $value) = (@$pair);
    257292
    258         $found_doctype = 1 if $field eq "doctype";
     293        #$found_doctype = 1 if $field eq "doctype";
    259294        if ($field ne "Identifier" && $field !~ /^gsdl/ &&
    260295        defined $value && $value ne "") {
     
    282317    # (unless doctype was already output as part of
    283318    # metadata)
    284     if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
    285         print $handle "<doctype>doc\n";
    286     }
     319    #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {
     320    #    print $handle "<doctype>doc\n";
     321    #}
    287322   
    288323   
Note: See TracChangeset for help on using the changeset viewer.