Changeset 2336


Ignore:
Timestamp:
2001-04-24T16:36:21+12:00 (23 years ago)
Author:
sjboddie
Message:

Added a -no_text option to buildcol.pl to allow collections to be built
without storing compressed text. This is intended for collections where
the original documents (e.g. PDFs or Word documents) are returned instead
of the compressed text.

Location:
trunk/gsdl
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/bin/script/buildcol.pl

    r1970 r2336  
    5858    print STDERR "   -keepold              will not destroy the current contents of the\n";
    5959    print STDERR "                         building directory\n";
     60    print STDERR "   -no_text              Don't store compressed text. This option is\n";
     61    print STDERR "                         useful for minimizing the size of the built\n";
     62    print STDERR "                         indexes if you intend always to display the\n";
     63    print STDERR "                         original documents at run time (i.e. you won't\n";
     64    print STDERR "                         be able to retrieve the compressed text version)\n";
    6065    print STDERR "   -allclassifications   Don't remove empty classifications\n";
    6166    print STDERR "   -create_images        Attempt to create default images for new\n";
     
    7984    $debug, $mode, $indexname, $keepold, $allclassifications,
    8085    $create_images, $collectdir, $out, $buildtype, $textindex,
    81     $no_strip_html);
     86    $no_strip_html, $no_text);
    8287    if (!parsargv::parse(\@ARGV,
    8388             'verbosity/\d+/2', \$verbosity,
     
    8994             'mode/^(all|compress_text|build_index|infodb)$/all', \$mode,
    9095             'index/.*/', \$indexname,
     96             'no_text', \$no_text,
    9197             'keepold', \$keepold,
    9298             'allclassifications', \$allclassifications,
     
    136142        $builddir = $collectcfg->{'builddir'};
    137143    }
     144    if (defined $collectcfg->{'collectdir'} && $collectdir eq "") {
     145        $collectdir = $collectcfg->{'collectdir'};
     146    }
     147    if (defined $collectcfg->{'no_text'} && $no_text == 0) {
     148        if ($collectcfg->{'no_text'} =~ /^true$/) {
     149        $no_text = 1;
     150        }
     151    }
     152    if (defined $collectcfg->{'allclassifications'} && $allclassifications == 0) {
     153        if ($collectcfg->{'allclassifications'} =~ /^true$/) {
     154        $allclassifications = 1;
     155        }
     156    }
    138157    if ($buildtype eq "mgpp" && defined $collectcfg->{'textcompress'}) {
    139158        $textindex = $collectcfg->{'textcompress'};
     
    219238    eval("\$builder = new $buildertype(\$collection, " .
    220239     "\$realarchivedir, \$realbuilddir, \$verbosity, " .
    221      "\$maxdocs, \$debug, \$keepold, \$allclassifications, \$out)");
     240     "\$maxdocs, \$debug, \$keepold, \$allclassifications, " .
     241     "\$out, \$no_text)");
    222242    die "$@" if $@;
    223243
  • trunk/gsdl/perllib/colcfg.pm

    r1851 r2336  
    6666    return &cfgread::read_cfg_file ($filename,
    6767                    q/^(creator|public|beta|defaultindex|importdir|/ .
    68                     q/archivedir|cachedir|builddir|removeold|textcompress|buildtype)$/,
     68                    q/archivedir|cachedir|builddir|removeold|/ .
     69                    q/textcompress|buildtype|collectdir|no_text|allclassifications)$/,
    6970                    q/(maintainer|languages|indexsubcollections|/ .
    7071                       q/indexes|dontbuild|dontgdbm|mirror|phind|levels)$/,
     
    7879    &cfgread::write_cfg_file($filename, $data,
    7980                 q/^(creator|public|beta|defaultindex|importdir|/ .
    80                  q/archivedir|cachedir|builddir|removeold|textcompress|buildtype)$/,
     81                 q/archivedir|cachedir|builddir|removeold|/ .
     82                 q/textcompress|buildtype|collectdir|no_text|allclassifications)$/,
    8183                 q/^(maintainer|languages|indexsubcollections|/ .
    8284                 q/indexes|dontbuild|dontgdbm|levels)$/,
  • trunk/gsdl/perllib/mgbuilder.pm

    r1973 r2336  
    6161sub new {
    6262    my ($class, $collection, $source_dir, $build_dir, $verbosity,
    63     $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;
     63    $maxdocs, $debug, $keepold, $allclassifications,
     64    $outhandle, $no_text) = @_;
    6465
    6566    $outhandle = STDERR unless defined $outhandle;
     67    $no_text = 0 unless defined $no_text;
    6668
    6769    # create an mgbuilder object
     
    7577              'allclassifications'=>$allclassifications,
    7678              'outhandle'=>$outhandle,
     79              'no_text'=>$no_text,
    7780              'notbuilt'=>[]    # indexes not built
    7881              }, $class;
     
    219222    } else {
    220223    if (!-e "$mg_passes_exe" ||
    221 #       !open (PIPEOUT, "| \"$mg_passes_exe\" -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra")) {
    222224        !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra")) {
    223225        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
     
    230232    $self->{'buildproc'}->set_index ($textindex);
    231233    $self->{'buildproc'}->set_indexing_text (0);
     234    if ($self->{'no_text'}) {
     235    $self->{'buildproc'}->set_store_text(0);
     236    } else {
     237    $self->{'buildproc'}->set_store_text(1);
     238    }
    232239    $self->{'buildproc'}->reset();
    233240    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
     
    251258        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
    252259    }
    253 #   system ("\"$mg_compression_dict_exe\" -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
    254260    system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
    255261
    256262    # -b $maxdocsize sets the maximum document size to be 12 meg
    257263    if (!-e "$mg_passes_exe" ||
    258 #       !open ($handle, "| \"$mg_passes_exe\" -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra")) {
    259264        !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra")) {
    260265        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
     
    526531    } else {
    527532    if (!-e "$mg_passes_exe" ||
    528 #       !open (PIPEOUT, "| \"$mg_passes_exe\" -f \"$fullindexprefix\" -b $maxdocsize " .
    529533        !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
    530534           "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
     
    539543    $self->{'buildproc'}->set_index ($index, $indexexparr);
    540544    $self->{'buildproc'}->set_indexing_text (1);
     545    $self->{'buildproc'}->set_store_text(1);
    541546
    542547    $self->{'buildproc'}->reset();
     
    552557        die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
    553558    }
    554 #   system ("\"$mg_perf_hash_build_exe\" -f \"$fullindexprefix\" $osextra");
    555559    system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");
    556560
    557561    if (!-e "$mg_passes_exe" ||
    558 #       !open ($handle, "| \"$mg_passes_exe\" -f \"$fullindexprefix\" -b $maxdocsize " .
    559562        !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
    560563           "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
     
    581584        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    582585    }
    583 #   system ("\"$mg_weights_build_exe\" -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");
    584586    system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");
    585587
     
    589591        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    590592    }
    591 #   system ("\"$mg_invf_dict_exe\" -f \"$fullindexprefix\" $osextra");
    592593    system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");
    593594
     
    598599        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    599600    }
    600 #   system ("\"$mg_stem_idx_exe\" -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
    601601    system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
    602 #   system ("\"$mg_stem_idx_exe\" -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
    603602    system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
    604 #   system ("\"$mg_stem_idx_exe\" -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
    605603    system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
    606 
    607604   
    608605    # remove unwanted files
     
    653650    $handle = STDOUT;
    654651    } else {
    655 #   if (!-e "$txt2db_exe" || !open (PIPEOUT, "| \"$txt2db_exe\" \"$fulldbname\"")) {
    656652    if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
    657653        die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
     
    666662    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    667663    $self->{'buildproc'}->set_indexing_text (0);
     664    $self->{'buildproc'}->set_store_text(1);
    668665    $self->{'buildproc'}->reset();
    669666
     
    731728    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    732729    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    733 #    if (!-e "$mgstat_exe" || !open (PIPEIN, "\"$mgstat_exe\" -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
    734730    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
    735731    print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
     
    797793    print $outhandle "Total bytes in $index: $num_processed_bytes\n";
    798794
    799     if ($num_processed_bytes < 50) {
     795    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
    800796    print $outhandle "***************\n";
    801     print $outhandle "WARNING: There is very little or no text to process for $index\n";
    802797    if ($indexing_text) {
    803         print $outhandle "This may cause an error while attempting to build the index\n";
    804     } else {
    805         print $outhandle "This may cause an error while attempting to compress the text\n";
    806     }
     798        print $outhandle "WARNING: There is very little or no text to process for $index\n";
     799    } elsif (!$self->{'no_text'}) {
     800        print $outhandle "WARNING: There is very little or no text to compress\n";
     801    }
     802    print $outhandle "         Was this your intention?\n";
    807803    print $outhandle "***************\n";
    808804    }
     
    810806
    8118071;
    812 
    813 
  • trunk/gsdl/perllib/mgbuildproc.pm

    r1424 r2336  
    6565    $self->{'num_bytes'} = 0;
    6666    $self->{'num_processed_bytes'} = 0;
     67    $self->{'store_text'} = 1;
    6768    $self->{'outhandle'} = $outhandle;
    6869
     
    170171    return $self->{'indexing_text'};
    171172}
     173
     174sub set_store_text {
     175    my $self = shift (@_);
     176    my ($store_text) = @_;
     177
     178    $self->{'store_text'} = $store_text;
     179}
     180
    172181
    173182sub process {
     
    446455            my $new_text = "";
    447456            if ($real_field eq "text") {
    448             $new_text = $doc_obj->get_text ($section);
     457            $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
    449458            $self->{'num_processed_bytes'} += length ($new_text);
    450459            $new_text =~ s/[\cB\cC]//g;
     
    457466                $self->{'num_processed_bytes'} += length ($meta);
    458467                $new_text .= "\cC" unless $first;
    459                 $new_text .= $meta;
     468                $new_text .= $meta if $self->{'store_text'};
    460469                $first = 0;
    461470            }
Note: See TracChangeset for help on using the changeset viewer.