Changeset 782


Ignore:
Timestamp:
1999-11-15T17:50:16+13:00 (24 years ago)
Author:
sjboddie
Message:

Removed gettext.pl; added debug, mode and index options to buildcol.pl,
and a debug option to import.pl.

Location:
trunk/gsdl/perllib
Files:
1 added
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgbuilder.pm

    r780 r782  
    4848sub new {
    4949    my ($class, $collection, $source_dir, $build_dir,
    50     $verbosity, $maxdocs, $allclassifications) = @_;
     50    $verbosity, $maxdocs, $debug, $allclassifications) = @_;
    5151
    5252    # create an mgbuilder object
     
    5656              'verbosity'=>$verbosity,
    5757              'maxdocs'=>$maxdocs,
     58              'debug'=>$debug,
    5859              'allclassifications'=>$allclassifications,
    5960              'notbuilt'=>[]    # indexes not built
     
    155156    my $self = shift (@_);
    156157
    157     # remove any old builds
    158     &util::rm_r($self->{'build_dir'});
    159     &util::mk_all_dir($self->{'build_dir'});
     158    if (!$self->{'debug'}) {
     159    # remove any old builds
     160    &util::rm_r($self->{'build_dir'});
     161    &util::mk_all_dir($self->{'build_dir'});
    160162       
    161     # make the text directory
    162     my $textdir = "$self->{'build_dir'}/text";
    163     &util::mk_all_dir($textdir);
     163    # make the text directory
     164    my $textdir = "$self->{'build_dir'}/text";
     165    &util::mk_all_dir($textdir);
     166    }
    164167}
    165168
     
    185188    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    186189
    187     # set up the document processor
    188     $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
     190    # collect the statistics for the text
     191    # -b $maxdocsize sets the maximum document size to be 12 meg
     192    print STDERR "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
     193
     194    my ($handle);
     195    if ($self->{'debug'}) {
     196    $handle = STDOUT;
     197    } else {
     198    if (!-e "$mg_passes_exe" ||
     199        !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
     200        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
     201    }
     202    $handle = mgbuilder::PIPEOUT;
     203    }
     204
     205    $self->{'buildproc'}->set_output_handle ($handle);
    189206    $self->{'buildproc'}->set_mode ('text');
    190207    $self->{'buildproc'}->set_index ($textindex);
    191208    $self->{'buildproc'}->set_indexing_text (0);
    192    
    193     # collect the statistics for the text
    194     # -b $maxdocsize sets the maximum document size to be 12 meg
    195     print STDERR "\n    collecting text statistics\n"  if ($self->{'verbosity'} >= 1);
    196     if (!-e "$mg_passes_exe" || !open (PIPEOUT,
    197             "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
    198     die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
    199     }
    200209    $self->{'buildproc'}->reset();
    201     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    202            "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    203     close (PIPEOUT);
     210    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     211                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     212
     213    close ($handle) unless $self->{'debug'};
    204214
    205215    # create the compression dictionary
     
    208218    # and the resulting dictionary must be less than 5 meg with the most frequent
    209219    # words being put into the dictionary first (-2 -k 5120)
    210     print STDERR "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
    211     if (!-e "$mg_compression_dict_exe") {
    212     die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
    213     }
    214     system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");
    215 
     220    if (!$self->{'debug'}) {
     221    print STDERR "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
     222    if (!-e "$mg_compression_dict_exe") {
     223        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
     224    }
     225    system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");
     226
     227    # -b $maxdocsize sets the maximum document size to be 12 meg
     228    if (!$self->{'debug'}) {
     229        if (!-e "$mg_passes_exe" ||
     230        !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
     231        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
     232        }
     233    }
     234    }
     235
     236    $self->{'buildproc'}->reset();
    216237    # compress the text
    217     # -b $maxdocsize sets the maximum document size to be 12 meg
    218238    print STDERR "\n    compressing the text\n"  if ($self->{'verbosity'} >= 1);
    219     if (!-e "$mg_passes_exe" || !open (PIPEOUT,
    220             "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
    221     die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
    222     }
    223     $self->{'buildproc'}->reset();
    224239    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    225240           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    226     close (PIPEOUT);
     241    close ($handle) unless $self->{'debug'};
    227242}
    228243
     
    245260sub build_indexes {
    246261    my $self = shift (@_);
    247     my $indexes = $self->{'collect_cfg'}->{'indexes'};
     262    my ($indexname) = @_;
     263
     264    my $indexes = [];
     265    if (defined $indexname && $indexname =~ /\w/) {
     266    push @$indexes, $indexname;
     267    } else {
     268    $indexes = $self->{'collect_cfg'}->{'indexes'};
     269    }
    248270
    249271    # create the mapping between the index descriptions
     
    441463    }
    442464    }
    443    
     465
     466    # Build index dictionary. Uses verbatim stem method
     467    print STDERR "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
     468    my ($handle);
     469    if ($self->{'debug'}) {
     470    $handle = STDOUT;
     471    } else {
     472    if (!-e "$mg_passes_exe" ||
     473        !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
     474           "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
     475        die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
     476    }
     477    $handle = mgbuilder::PIPEOUT;
     478    }
     479   
    444480    # set up the document processor
    445     $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
     481    $self->{'buildproc'}->set_output_handle ($handle);
    446482    $self->{'buildproc'}->set_mode ('text');
    447483    $self->{'buildproc'}->set_index ($index, $indexexparr);
    448484    $self->{'buildproc'}->set_indexing_text (1);
    449485
    450 
    451     # Build index dictionary. Uses verbatim stem method
    452     print STDERR "\n    creating index dictionary\n"  if ($self->{'verbosity'} >= 1);
    453     if (!-e "$mg_passes_exe" || !open (PIPEOUT,
    454         "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
    455         "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
    456     die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
    457     }
    458486    $self->{'buildproc'}->reset();
    459487    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    460488           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    461     close (PIPEOUT);
    462 
    463     # create the perfect hash function
    464     if (!-e "$mg_perf_hash_build_exe") {
    465     die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
    466     }
    467     system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");
    468 
     489    close ($handle) unless $self->{'debug'};
     490
     491    if (!$self->{'debug'}) {
     492    # create the perfect hash function
     493    if (!-e "$mg_perf_hash_build_exe") {
     494        die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
     495    }
     496    system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");
     497
     498    if (!-e "$mg_passes_exe" ||
     499        !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
     500           "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
     501        die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
     502    }
     503    }
     504   
    469505    # invert the text
    470506    print STDERR "\n    inverting the text\n"  if ($self->{'verbosity'} >= 1);
    471     if (!-e "$mg_passes_exe" || !open (PIPEOUT,
    472         "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
    473         "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
    474     die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
    475     }
     507
    476508    $self->{'buildproc'}->reset();
    477509    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    478510           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    479     close (PIPEOUT);
    480 
    481     # create the weights file
    482     print STDERR "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
    483     if (!-e "$mg_weights_build_exe") {
    484     die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    485     }
    486     system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");
    487 
    488     # create 'on-disk' stemmed dictionary
    489     print STDERR "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
    490     if (!-e "$mg_invf_dict_exe") {
    491     die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    492     }
    493     system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");
    494 
    495 
    496     # creates stem index files for the various stemming methods
    497     print STDERR "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
    498     if (!-e "$mg_stem_idx_exe") {
    499     die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    500     }
    501     system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
    502     system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
    503     system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");
    504 
    505511   
    506     # remove unwanted files
    507     my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    508     opendir (DIR, $tmpdir) || die
    509     "mgbuilder::build_index - couldn't read directory $tmpdir\n";
    510     foreach $file (readdir(DIR)) {
    511     next if $file =~ /^\./;
    512     my ($suffix) = $file =~ /\.([^\.]+)$/;
    513     if (defined $suffix && !defined $wanted_index_files{$suffix}) {
    514         # delete it!
    515 #       print STDERR "deleting $file\n";
    516         &util::rm (&util::filename_cat ($tmpdir, $file));
    517     }
    518     }
    519     closedir (DIR);
     512    if (!$self->{'debug'}) {
     513
     514    close ($handle);
     515   
     516    # create the weights file
     517    print STDERR "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
     518    if (!-e "$mg_weights_build_exe") {
     519        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
     520    }
     521    system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");
     522
     523    # create 'on-disk' stemmed dictionary
     524    print STDERR "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
     525    if (!-e "$mg_invf_dict_exe") {
     526        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
     527    }
     528    system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");
     529
     530
     531    # creates stem index files for the various stemming methods
     532    print STDERR "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
     533    if (!-e "$mg_stem_idx_exe") {
     534        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
     535    }
     536    system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
     537    system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
     538    system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");
     539
     540   
     541    # remove unwanted files
     542    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
     543    opendir (DIR, $tmpdir) || die
     544        "mgbuilder::build_index - couldn't read directory $tmpdir\n";
     545    foreach $file (readdir(DIR)) {
     546        next if $file =~ /^\./;
     547        my ($suffix) = $file =~ /\.([^\.]+)$/;
     548        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
     549        # delete it!
     550        print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
     551        &util::rm (&util::filename_cat ($tmpdir, $file));
     552        }
     553    }
     554    closedir (DIR);
     555    }
    520556}
    521557
     
    541577
    542578    # set up the document processor
    543     $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
     579    my ($handle);
     580    if ($self->{'debug'}) {
     581    $handle = STDOUT;
     582    } else {
     583    if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
     584        die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
     585    }
     586    $handle = mgbuilder::PIPEOUT;
     587    }
     588
     589    $self->{'buildproc'}->set_output_handle ($handle);
    544590    $self->{'buildproc'}->set_mode ('infodb');
    545591    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    546592    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    547593    $self->{'buildproc'}->set_indexing_text (0);
    548    
    549     # create the infodatabase
    550     if (!-e "$txt2db_exe" || !open (PIPEOUT,
    551             "| $txt2db_exe $fulldbname")) {
    552     die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
    553     }
    554594    $self->{'buildproc'}->reset();
    555595
    556596    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
    557 
     597   
    558598    if (!defined $self->{'index_mapping'}) {
    559599        $self->{'index_mapping'} =
     
    561601    }
    562602
    563     print PIPEOUT "[collection]\n";
    564 
     603    print $handle "[collection]\n";
     604   
    565605    foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
    566606        if ($cmeta =~ s/^\.//) {
    567607        if (defined $self->{'index_mapping'}->{$cmeta}) {
    568             print PIPEOUT "<$self->{'index_mapping'}->{$cmeta}>" .
     608            print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
    569609            $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
    570610        } else {
     
    572612        }
    573613        } else {
    574         print PIPEOUT "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
     614        print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
    575615        }
    576616    }
    577     print PIPEOUT "\n" . ('-' x 70) . "\n";
    578 
    579     }
    580    
     617    print $handle "\n" . ('-' x 70) . "\n";
     618
     619    }
    581620
    582621    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
     
    584623
    585624    # output classification information
    586     &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT',
     625    &classify::output_classify_info ($self->{'classifiers'}, $handle,
    587626                     $self->{'allclassifications'});
    588627
    589     close (PIPEOUT);
     628    close ($handle) if !$self->{'debug'};
    590629}
    591630
Note: See TracChangeset for help on using the changeset viewer.