Changeset 2478


Ignore:
Timestamp:
2001-05-29T10:49:04+12:00 (23 years ago)
Author:
kjm18
Message:

Brought mgppbuilder.pm in line with the changes to buildcol.pl and mgbuilder.pm.
It now uses the new mgpp executable names (mgpp_passes instead of mg_passes).

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuilder.pm

    r1917 r2478  
    5959               'i'=>1,
    6060               'il'=>1,
    61                'tw'=>1,
    6261               'w'=>1,
    6362               'wa'=>1);
     
    8786sub new {
    8887    my ($class, $collection, $source_dir, $build_dir, $verbosity,
    89     $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;
     88    $maxdocs, $debug, $keepold, $allclassifications,
     89    $outhandle, $no_text) = @_;
    9090
    9191    $outhandle = STDERR unless defined $outhandle;
    92 
     92    $no_text = 0 unless defined $no_text;
     93   
    9394    # create an mgppbuilder object
    9495    my $self = bless {'collection'=>$collection,
     
    101102              'allclassifications'=>$allclassifications,
    102103              'outhandle'=>$outhandle,
     104              'no_text'=>$no_text,
    103105              'notbuilt'=>[],    # indexes not built
    104106              'indexfieldmap'=>\%static_indexfield_map
     
    130132    foreach $language (@{$self->{'collect_cfg'}->{'languages'}}) {
    131133        foreach $index (@$indexes) {
    132         push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
     134        if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
     135            push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
     136        }
     137        else { # add in an empty subcollection field
     138            push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
     139        }       
    133140        }
    134141    }
     
    233240    my ($textindex) = @_;
    234241
    235     my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
     242    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    236243    my $exe = &util::get_os_exe ();
    237     my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    238     my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
     244    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
     245    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    239246    my $outhandle = $self->{'outhandle'};
    240247
     
    244251    my $basefilename = "text/$self->{'collection'}";
    245252
    246 # mgpp cant work on windows at the moment   
    247 #     if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    248 #    $basefilename =~ s/\//\\/g;
    249 #    $builddir =~ s/\//\\/g;
    250 #   
    251 #    }
     253    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     254    $basefilename =~ s/\//\\/g;
     255    $builddir =~ s/\//\\/g;
     256   
     257    }
    252258
    253259
     
    255261    # the compressor doesn't need to know about paragraphs - never want to
    256262    # retrieve them
    257     my $mg_passes_sections = "";
     263    my $mgpp_passes_sections = "";
    258264    if ($self->{'levels'}->{'Section'}) {
    259     $mg_passes_sections .= "-K Section ";
     265    $mgpp_passes_sections .= "-K Section ";
    260266    }
    261267   
     
    264270    # collect the statistics for the text
    265271    # -b $maxdocsize sets the maximum document size to be 12 meg
    266     print $outhandle "\n    collecting text statistics (mg_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
     272    print $outhandle "\n    collecting text statistics (mgpp_passes -T1)\n"  if ($self->{'verbosity'} >= 1);
    267273
    268274    my ($handle);
     
    270276    $handle = STDOUT;
    271277    } else {
    272     if (!-e "$mg_passes_exe" ||
    273         !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -T1")) {
    274         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
     278    if (!-e "$mgpp_passes_exe" ||
     279        !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections  -d $builddir -f $basefilename -T1")) {
     280        die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
    275281    }
    276282    $handle = mgppbuilder::PIPEOUT;
     
    281287    $self->{'buildproc'}->set_index ($textindex);
    282288    $self->{'buildproc'}->set_indexing_text (0);
     289    if ($self->{'no_text'}) {
     290    $self->{'buildproc'}->set_store_text(0);
     291    } else {
     292    $self->{'buildproc'}->set_store_text(1);
     293    }
    283294    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    284295    $self->{'buildproc'}->set_levels ($self->{'levels'});                     
     
    293304    close ($handle) unless $self->{'debug'};
    294305
     306    $self->print_stats();
     307
    295308    # create the compression dictionary
    296309    # the compression dictionary is built by assuming the stats are from a seed
     
    301314    if (!$self->{'debug'}) {
    302315    print $outhandle "\n    creating the compression dictionary\n"  if ($self->{'verbosity'} >= 1);
    303     if (!-e "$mg_compression_dict_exe") {
    304         die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
    305     }
    306     system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");
    307 
     316    if (!-e "$mgpp_compression_dict_exe") {
     317        die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
     318    }
     319    system ("mgpp_compression_dict$exe -d $builddir -f $basefilename -S -H -2 -k 5120");
    308320
    309321    if (!$self->{'debug'}) {
    310         if (!-e "$mg_passes_exe" ||
    311         !open ($handle, "| $mg_passes_exe $mg_passes_compress_sections -f $basefilename -d $builddir -T2")) {
    312         die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
     322        if (!-e "$mgpp_passes_exe" ||
     323        !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f $basefilename -d $builddir -T2")) {
     324        die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
    313325        }
    314326    }
     
    317329    $self->{'buildproc'}->reset();
    318330    # compress the text
    319     print $outhandle "\n    compressing the text (mg_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
     331    print $outhandle "\n    compressing the text (mgpp_passes -T2)\n"  if ($self->{'verbosity'} >= 1);
    320332    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    321333           "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
     
    408420    }
    409421
     422    $mapping{$index} = $dirname;
     423
    410424    # store the mapping orders as well as the maps
     425    # also put index, subcollection and language fields into the mapping thing -
     426    # (the full index name (eg document:text:subcol:lang) is not used on
     427    # the query page) -these are used for collectionmeta later on
    411428    if (!defined $mapping{'indexmap'}{"$fields"}) {
    412429        $mapping{'indexmap'}{"$fields"} = $pindex;
    413430        push (@{$mapping{'indexmaporder'}}, "$fields");
     431        if (!defined $mapping{"$fields"}) {
     432        $mapping{"$fields"} = $pindex;
     433        }   
    414434    }
    415435    if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
    416436        $mapping{'subcollectionmap'}{$subcollection} = $psub;
    417437        push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
     438        $mapping{$subcollection} = $psub;
    418439    }
    419440    if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
    420441        $mapping{'languagemap'}{$languages} = $plang;
    421442        push (@{$mapping{'languagemaporder'}}, $language);
    422     }
    423     $mapping{$index} = $dirname;
     443        $mapping{$languages} = $plang;
     444    }
    424445    $dirnames{$dirname} = $index;
    425446    $pnames{'index'}{$pindex} = "$fields";
     
    499520
    500521    # get any os specific stuff
    501     my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
     522    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    502523
    503524    my $exe = &util::get_os_exe ();
    504     my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
     525    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    505526
    506527    # define the section names for mgpasses
    507     my $mg_passes_sections = "";
     528    my $mgpp_passes_sections = "";
    508529    foreach $level (keys (%{$self->{'levels'}})) {
    509530    if ($level eq "Section" || $level eq "Paragraph") {
    510         $mg_passes_sections .= "-K $level ";
    511     }
    512     }
    513 
    514     my $mg_perf_hash_build_exe =
    515     &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    516     my $mg_weights_build_exe =
    517     &util::filename_cat ($exedir, "mg_weights_build$exe");
    518     my $mg_invf_dict_exe =
    519     &util::filename_cat ($exedir, "mg_invf_dict$exe");
    520     my $mg_stem_idx_exe =
    521     &util::filename_cat ($exedir, "mg_stem_idx$exe");
    522 
    523 #    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
    524 #   $builddir=~ s/\//\\/g;
    525 #   $basefilename =~ s/\//\\/g;
    526 #    }
     531        $mgpp_passes_sections .= "-K $level ";
     532    }
     533    }
     534
     535    my $mgpp_perf_hash_build_exe =
     536    &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
     537    my $mgpp_weights_build_exe =
     538    &util::filename_cat ($exedir, "mgpp_weights_build$exe");
     539    my $mgpp_invf_dict_exe =
     540    &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
     541    my $mgpp_stem_idx_exe =
     542    &util::filename_cat ($exedir, "mgpp_stem_idx$exe");
     543
     544    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
     545    $builddir=~ s/\//\\/g;
     546    $basefilename =~ s/\//\\/g;
     547    }
    527548
    528549    # get the index expression if this index belongs
    529550    # to a subcollection
    530551    my $indexexparr = [];
    531     my ($fields, $subcollection) = split (":", $index);
     552
     553    # there may be subcollection info, and language info.
     554    my ($fields, $subcollection, $language) = split (":", $index);
    532555    my @subcollections = ();
    533556    @subcollections = split /,/, $subcollection if (defined $subcollection);
     
    540563   
    541564    # add expressions for languages if this index belongs to
    542     # a language subcollection
    543     foreach $language (@{$self->{'collect_cfg'}->{'languages'}}) {
     565    # a language subcollection - only put languages expressions for the
     566    # ones we want in the index
     567
     568    my @languages = ();
     569    @languages = split /,/, $language if (defined $language);
     570    foreach $language (@languages) {
     571    my $not=0;
    544572    if ($language =~ s/^\!//) {
    545         push (@$indexexparr, "!Language/$language/");
    546     } else {
    547         push (@$indexexparr, "Language/$language/");
     573        $not = 1;
     574    }
     575    foreach $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
     576        if ($lang eq $language) {
     577        if ($not) {
     578            push (@$indexexparr, "!Language/$language/");
     579        } else {
     580            push (@$indexexparr, "Language/$language/");
     581        }
     582        last;
     583        }
    548584    }
    549585    }
    550586
    551587    # Build index dictionary. Uses verbatim stem method
    552     print $outhandle "\n    creating index dictionary (mg_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
     588    print $outhandle "\n    creating index dictionary (mgpp_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    553589    my ($handle);
    554590    if ($self->{'debug'}) {
    555591    $handle = STDOUT;
    556592    } else {
    557     if (!-e "$mg_passes_exe" ||
    558         !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I1")) {
    559         die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
     593    if (!-e "$mgpp_passes_exe" ||
     594        !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections  -d $builddir -f $basefilename -I1")) {
     595        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
    560596    }
    561597    $handle = mgppbuilder::PIPEOUT;
     
    567603    $self->{'buildproc'}->set_index ($index, $indexexparr);
    568604    $self->{'buildproc'}->set_indexing_text (1);
     605    $self->{'buildproc'}->set_store_text(1);
    569606    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    570607    $self->{'buildproc'}->set_levels ($self->{'levels'});                       
     
    578615    if (!$self->{'debug'}) {
    579616    # create the perfect hash function
    580     if (!-e "$mg_perf_hash_build_exe") {
    581         die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
    582     }
    583     system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");
    584 
    585     if (!-e "$mg_passes_exe" ||
    586         !open ($handle, "| $mg_passes_exe $mg_passes_sections  -d $builddir -f $basefilename -I2")) {
    587         die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
     617    if (!-e "$mgpp_perf_hash_build_exe") {
     618        die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
     619    }
     620    system ("mgpp_perf_hash_build$exe -d $builddir -f $basefilename");
     621
     622    if (!-e "$mgpp_passes_exe" ||
     623        !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections  -d $builddir -f $basefilename -I2")) {
     624        die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
    588625    }
    589626    }
    590627   
    591628    # invert the text
    592     print $outhandle "\n    inverting the text (mg_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
     629    print $outhandle "\n    inverting the text (mgpp_passes -I2)\n"  if ($self->{'verbosity'} >= 1);
    593630
    594631    $self->{'buildproc'}->reset();
     
    604641    # create the weights file
    605642    print $outhandle "\n    create the weights file\n"  if ($self->{'verbosity'} >= 1);
    606     if (!-e "$mg_weights_build_exe") {
    607         die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    608     }
    609     system ("$mg_weights_build_exe -d $builddir -f $basefilename");
     643    if (!-e "$mgpp_weights_build_exe") {
     644        die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
     645    }
     646    system ("mgpp_weights_build$exe -d $builddir -f $basefilename");
    610647
    611648    # create 'on-disk' stemmed dictionary
    612649    print $outhandle "\n    creating 'on-disk' stemmed dictionary\n"  if ($self->{'verbosity'} >= 1);
    613     if (!-e "$mg_invf_dict_exe") {
    614         die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    615     }
    616     system ("$mg_invf_dict_exe -d $builddir -f $basefilename");
     650    if (!-e "$mgpp_invf_dict_exe") {
     651        die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
     652    }
     653    system ("mgpp_invf_dict$exe -d $builddir -f $basefilename");
    617654
    618655
    619656    # creates stem index files for the various stemming methods
    620657    print $outhandle "\n    creating stem indexes\n"  if ($self->{'verbosity'} >= 1);
    621     if (!-e "$mg_stem_idx_exe") {
    622         die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    623     }
    624     system ("$mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");
    625     system ("$mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");
    626     system ("$mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");
     658    if (!-e "$mgpp_stem_idx_exe") {
     659        die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
     660    }
     661    system ("mgpp_stem_idx$exe -b 4096 -s1 -d $builddir -f $basefilename");
     662    system ("mgpp_stem_idx$exe -b 4096 -s2 -d $builddir -f $basefilename");
     663    system ("mgpp_stem_idx$exe -b 4096 -s3 -d $builddir -f $basefilename");
    627664
    628665   
     
    664701    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
    665702
     703    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
     704    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
     705    #check build.cfg to see if indexfields have been filled in
     706    $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
     707    if (-e $buildconfigfile) {
     708        $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
     709        if (defined $buildcfg->{'indexfields'}) {
     710        foreach $field (@{$buildcfg->{'indexfields'}}) {
     711            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
     712        }
     713        }
     714        if (defined $buildcfg->{'indexfieldmap'}) {
     715        foreach $field (@{$buildcfg->{'indexfieldmap'}}) {
     716            ($f, $v) = $field =~ /^(.*)\-\>(.*)$/;
     717            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
     718        }
     719        }       
     720    }
     721    }
     722   
    666723    print $outhandle "\n*** creating the info database and processing associated files\n"
    667724    if ($self->{'verbosity'} >= 1);
     
    687744    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    688745    $self->{'buildproc'}->set_indexing_text (0);
    689     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     746    $self->{'buildproc'}->set_store_text(1);
     747    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    690748
    691749    $self->{'buildproc'}->reset();
     
    714772    }
    715773    #print out the indexfield mapping
    716     foreach $field (keys(%{$self->{'indexfieldmap'}})) {
    717         $shortname = $self->{'indexfieldmap'}->{$field};
    718         print $handle "<$shortname>$field\n";
     774    foreach $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
     775        $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field};
     776        print $handle "<$shortname>$field\n" if defined $shortname;
    719777    }
    720778    print $handle "\n" . ('-' x 70) . "\n";
     
    820878    print $outhandle "Total bytes in $index: $num_processed_bytes\n";
    821879
    822     if ($num_processed_bytes < 50) {
     880    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
    823881    print $outhandle "***************\n";
    824     print $outhandle "WARNING: There is very little or no text to process for $index\n";
    825882    if ($indexing_text) {
    826         print $outhandle "This may cause an error while attempting to build the index\n";
    827     } else {
    828         print $outhandle "This may cause an error while attempting to compress the text\n";
    829     }
     883        print $outhandle "WARNING: There is very little or no text to process for $index\n";
     884    } elsif (!$self->{'no_text'}) {
     885        print $outhandle "WARNING: There is very little or no text to compress\n";
     886    }     
     887    print $outhandle "         Was this your intention?\n";
    830888    print $outhandle "***************\n";
    831889    }
Note: See TracChangeset for help on using the changeset viewer.