Changeset 10468


Ignore:
Timestamp:
2005-08-10T16:19:17+12:00 (19 years ago)
Author:
kjdon
Message:

made a base builder class, and moved lots of the code to it. Hope I haven't stuffed anything up :-)

Location:
trunk/gsdl/perllib
Files:
1 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuilder.pm

    r10158 r10468  
    4444    $self = bless $self, $class;
    4545
    46     # load up the document processor for building
    47     # if a buildproc class has been created for this collection, use it
    48     # otherwise, use the lucene buildproc
    49     my ($buildprocdir, $buildproctype);
    50     if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
    51     $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    52     $buildproctype = "${collection}buildproc";
    53     } else {
    54     $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    55     $buildproctype = "lucenebuildproc";
    56     }
    57     require "$buildprocdir/$buildproctype.pm";
    58 
    59     eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
    60      "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)");
    61     die "$@" if $@;
    62 
    6346    $self->{'buildtype'} = "lucene";
    6447
    6548    return $self;
     49}
     50
     51sub default_buildproc {
     52    my $self  = shift (@_);
     53
     54    return "lucenebuildproc";
    6655}
    6756
     
    188177}
    189178
    190 
    191 
    192 
    193 
    194179sub build_index {
    195180    my $self = shift (@_);
  • trunk/gsdl/perllib/mgbuilder.pm

    r10158 r10468  
    2626package mgbuilder;
    2727
     28use basebuilder;
    2829use classify;
    2930use cfgread;
     
    3435
    3536BEGIN {
    36     # set autoflush on for STDERR and STDOUT so that mg
    37     # doesn't get out of sync with plugins
    38     STDOUT->autoflush(1);
    39     STDERR->autoflush(1);
    40 }
    41 
    42 END {
    43     STDOUT->autoflush(0);
    44     STDERR->autoflush(0);
    45 }
    46 
    47 my $maxdocsize = 12000;
     37    @mgbuilder::ISA = ('basebuilder');
     38}
     39
    4840
    4941my %wanted_index_files = ('td'=>1,
     
    5850               'wa'=>1);
    5951
     52my $maxdocsize = $basebuilder::maxdocsize;
     53
    6054
    6155sub new {
    62     my ($class, $collection, $source_dir, $build_dir, $verbosity,
     56    my $class = shift(@_);
     57
     58    my ($collection, $source_dir, $build_dir, $verbosity,
    6359    $maxdocs, $debug, $keepold, $remove_empty_classifications,
    6460    $outhandle, $no_text, $failhandle, $gli) = @_;
    6561
    66     $outhandle = STDERR unless defined $outhandle;
    67     $no_text = 0 unless defined $no_text;
    68     $failhandle = STDERR unless defined $failhandle;
    69 
    70     # create an mgbuilder object
    71     my $self = bless {'collection'=>$collection,
    72               'source_dir'=>$source_dir,
    73               'build_dir'=>$build_dir,
    74               'verbosity'=>$verbosity,
    75               'maxdocs'=>$maxdocs,
    76               'debug'=>$debug,
    77               'keepold'=>$keepold,
    78               'remove_empty_classifications'=>$remove_empty_classifications,
    79               'outhandle'=>$outhandle,
    80               'no_text'=>$no_text,
    81               'failhandle'=>$failhandle,
    82               'notbuilt'=>{},    # indexes not built
    83               'gli'=>$gli
    84               }, $class;
    85 
    86     $self->{'gli'} = 0 unless defined $self->{'gli'};
    87 
    88     # read in the collection configuration file
    89     my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    90     if (!-e $colcfgname) {
    91     die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    92     }
    93     $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
     62    my $self = new basebuilder (@_);
     63    $self = bless $self, $class;
     64
     65    $self->{'buildtype'} = "mg";
     66    return $self;
     67}
     68
     69sub default_buildproc {
     70    my $self  = shift (@_);
     71
     72    return "mgbuildproc";
     73}
     74
     75sub generate_index_list {
     76    my $self = shift (@_);
    9477
    9578    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
    9679    $self->{'collect_cfg'}->{'indexes'} = [];
    9780    }
    98 
    99     # sort out subcollection indexes
    100     if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
    101     my $indexes = $self->{'collect_cfg'}->{'indexes'};
    102     $self->{'collect_cfg'}->{'indexes'} = [];
    103     foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
    104         foreach my $index (@$indexes) {
    105         push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
    106         }
    107     }
    108     }
    109 
    110     # sort out language subindexes
    111     if (defined $self->{'collect_cfg'}->{'languages'}) {
    112     my $indexes = $self->{'collect_cfg'}->{'indexes'};
    113     $self->{'collect_cfg'}->{'indexes'} = [];
    114     foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
    115         foreach my $index (@$indexes) {
    116         if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
    117             push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
    118         }
    119         else { # add in an empty subcollection field
    120             push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
    121         }
    122         }
    123     }
    124     }
    125 
    126     if (defined($self->{'collect_cfg'}->{'indexes'})) {
    127     # make sure that the same index isn't specified more than once
    128     my %tmphash = ();
    129     my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    130     $self->{'collect_cfg'}->{'indexes'} = [];
    131     foreach my $i (@tmparray) {
    132         if (!defined ($tmphash{$i})) {
    133         push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
    134         $tmphash{$i} = 1;
    135         }
    136     }
    137     } else {
    138     $self->{'collect_cfg'}->{'indexes'} = [];
    139     }
    140 
    14181    if (scalar(@{$self->{'collect_cfg'}->{'indexes'}}) == 0) {
    14282    # no indexes have been specified so we'll build a "dummy:text" index
     
    14484    }
    14585
    146     # get the list of plugins for this collection
    147     my $plugins = [];
    148     if (defined $self->{'collect_cfg'}->{'plugin'}) {
    149     $plugins = $self->{'collect_cfg'}->{'plugin'};
    150     }
    151    
    152     # load all the plugins
    153 
    154     #build up the extra global options for the plugins
    155     my @global_opts = ();
    156     if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
    157     push @global_opts, "-separate_cjk";
    158     }
    159     $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);
    160    
    161     if (scalar(@{$self->{'pluginfo'}}) == 0) {
    162     print $outhandle "No plugins were loaded.\n";
    163     die "\n";
    164     }
    165 
    166     # get the list of classifiers for this collection
    167     my $classifiers = [];
    168     if (defined $self->{'collect_cfg'}->{'classify'}) {
    169     $classifiers = $self->{'collect_cfg'}->{'classify'};
    170     }
    171    
    172     # load all the classifiers
    173     $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);
    174 
    175     # load up any dontgdbm fields
    176     $self->{'dontgdbm'} = {};
    177     if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
    178     foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
    179         $self->{'dontgdbm'}->{$dg} = 1;
    180     }
    181     }
    182 
    183     # load up the document processor for building
    184     # if a buildproc class has been created for this collection, use it
    185     # otherwise, use the mg buildproc
    186     my ($buildprocdir, $buildproctype);
    187     if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
    188     $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    189     $buildproctype = "${collection}buildproc";
    190     } else {
    191     $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    192     $buildproctype = "mgbuildproc";
    193     }
    194  
    195     require "$buildprocdir/$buildproctype.pm";
    196 
    197     eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
    198      "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)");
    199     die "$@" if $@;
    200 
    201     return $self;
    202 }
    203 
    204 sub init {
    205     my $self = shift (@_);
    206 
    207     if (!$self->{'debug'} && !$self->{'keepold'}) {
    208     # remove any old builds
    209     &util::rm_r($self->{'build_dir'});
    210     &util::mk_all_dir($self->{'build_dir'});
    211        
    212     # make the text directory
    213     my $textdir = "$self->{'build_dir'}/text";
    214     &util::mk_all_dir($textdir);
    215     }
    216 }
     86}
     87
    21788
    21889sub compress_text {
     
    324195}
    325196
    326 sub want_built {
    327     my $self = shift (@_);
    328     my ($index) = @_;
    329 
    330     if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
    331     foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
    332         if ($index =~ /^$checkstr$/) {
    333         #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
    334         $self->{'notbuilt'}->{$index} = 1;
    335         return 0;
    336         }
    337     }
    338     }
    339 
    340     return 1;
    341 }
    342 
    343 sub build_indexes {
    344     my $self = shift (@_);
    345     my ($indexname) = @_;
    346     my $outhandle = $self->{'outhandle'};
    347     my $indexes = [];
    348     if (defined $indexname && $indexname =~ /\w/) {
    349     push @$indexes, $indexname;
    350     } else {
    351     $indexes = $self->{'collect_cfg'}->{'indexes'};
    352     }
    353 
    354     # create the mapping between the index descriptions
    355     # and their directory names
    356     $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
    357 
    358     # build each of the indexes
    359     foreach my $index (@$indexes) {
    360     if ($self->want_built($index)) {
    361         print $outhandle "\n*** building index $index in subdirectory " .
    362         "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
    363         print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
    364         $self->build_index($index);
    365     } else {
    366         print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
    367     }
    368     }
    369 }
    370197
    371198# creates directory names for each of the index descriptions
     
    440267}
    441268
    442 # returns a processed version of a field.
    443 # if the field has only one component the processed
    444 # version will contain the first character and next consonant
    445 # of that componant - otherwise it will contain the first
    446 # character of the first two components
    447 sub process_field {
    448     my $self = shift (@_);
    449     my ($field) = @_;
    450  
    451     return "" unless (defined ($field) && $field =~ /\w/);
    452 
    453     my @components = split /,/, $field;
    454     if (scalar @components >= 2) {
    455     splice (@components, 2);
    456     map {s/^(.).*$/$1/;} @components;
    457     return join("", @components);
    458     } else {
    459     my ($a, $b) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    460     ($a, $b) = $field =~ /^(.)(.)/ unless defined $a && defined $b;
    461     return "$a$b";
    462     }
    463 }
    464269
    465270sub make_unique {
     
    477282    return "$$indexref$$subref$$langref";
    478283}   
    479 
    480 sub get_next_version {
    481     my $self = shift (@_);
    482     my ($nameref) = @_;
    483     if ($$nameref =~ /(\d\d)$/) {
    484     my $num = $1; $num ++;
    485     $$nameref =~ s/\d\d$/$num/;
    486     } elsif ($$nameref =~ /(\d)$/) {
    487     my $num = $1;
    488     if ($num == 9) {$$nameref =~ s/\d$/10/;}
    489     else {$num ++; $$nameref =~ s/\d$/$num/;}
    490     } else {
    491     $$nameref =~ s/.$/0/;
    492     }
    493 }
    494284
    495285sub build_index {
     
    696486}
    697487
    698 sub make_infodatabase {
    699     my $self = shift (@_);
    700     my $outhandle = $self->{'outhandle'};
    701 
    702     my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    703     my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    704     &util::mk_all_dir ($textdir);
    705     &util::mk_all_dir ($assocdir);
    706 
    707     # get db name
    708     my $dbext = ".bdb";
    709     $dbext = ".ldb" if &util::is_little_endian();
    710     my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    711     $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
    712 
    713     my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    714     my $exe = &util::get_os_exe ();
    715     my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
    716 
    717     print $outhandle "\n*** creating the info database and processing associated files\n"
    718     if ($self->{'verbosity'} >= 1);
    719     print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
    720 
    721     # init all the classifiers
    722     &classify::init_classifiers ($self->{'classifiers'});
    723    
    724     # set up the document processor
    725     my ($handle);
    726     if ($self->{'debug'}) {
    727     $handle = STDOUT;
    728     } else {
    729     if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
    730         print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
    731         die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
    732     }
    733     $handle = mgbuilder::PIPEOUT;
    734     }
    735    
    736     $self->{'buildproc'}->set_output_handle ($handle);
    737     $self->{'buildproc'}->set_mode ('infodb');
    738     $self->{'buildproc'}->set_assocdir ($assocdir);
    739     $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    740     $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    741     $self->{'buildproc'}->set_indexing_text (0);
    742     $self->{'buildproc'}->set_store_text(1);
    743     $self->{'buildproc'}->reset();
     488sub build_cfg_extra {
     489   my $self = shift(@_);
     490   my ($build_cfg) = @_;
    744491   
    745     if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
    746    
    747         if (!defined $self->{'index_mapping'}) {
    748         $self->{'index_mapping'} =
    749         $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
    750     }
    751    
    752     print $handle "[collection]\n";
    753      
    754     foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
    755         my $defaultfound=0;
    756         my $first=1;
    757         my $metadata_entry = "";
    758         my $default="";
    759         my $cmetamap = "";
    760         if ($cmeta =~ s/^\.//) {
    761             if (defined $self->{'index_mapping'}->{$cmeta}) {
    762             $cmetamap = $self->{'index_mapping'}->{$cmeta};
    763             $cmeta = ".$cmeta";
    764         }
    765         else {
    766             print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
    767             next; #ignore this one
    768         }
    769         }
    770         else {
    771         $cmetamap = $cmeta; # just using the same name
    772         }
    773         #iterate through the languages
    774         foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
    775         if ($first) {
    776             $first=0;
    777             #set the default default to the first entry
    778             $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
    779         }
    780         if ($lang =~ /default/) {
    781             $defaultfound=1;
    782             #the default entry goes first
    783             $metadata_entry = "<$cmetamap>" .
    784             $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
    785         }
    786         else {
    787             my ($l) = $lang =~ /^\[l=(\w*)\]$/;
    788             if ($l) {
    789             $metadata_entry .= "<$cmetamap:$l>" .
    790                 $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";
    791 
    792             # Use the English value as the default if no default is specified
    793             if ($l =~ /en/i) {
    794                 $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
    795             }
    796             }
    797         }
    798         }
    799         #if we haven't found a default, put one in
    800         if (!$defaultfound) {
    801         $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
    802         }
    803     #write the entry to the file
    804     print $handle $metadata_entry;
    805 
    806     }
    807    
    808     print $handle "\n" . ('-' x 70) . "\n";
    809     }
    810    
    811     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    812            "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    813    
    814     # output classification information
    815     &classify::output_classify_info ($self->{'classifiers'}, $handle,
    816                      $self->{'remove_empty_classifications'},
    817                      $self->{'gli'});
    818 
    819 
    820     #output doclist
    821     my @doclist = $self->{'buildproc'}->get_doc_list();
    822     my $docs = join (";",@doclist);
    823     print $handle "[browselist]\n";
    824     print $handle "<hastxt>0\n";
    825     print $handle "<childtype>VList\n";
    826     print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    827     print $handle "<thistype>Invisible\n";
    828     print $handle "<contains>$docs";
    829     print $handle "\n" . ('-' x 70) . "\n";
    830 
    831     close ($handle) if !$self->{'debug'};
    832 
    833     print STDERR "</Stage>\n" if $self->{'gli'};
    834 }
    835 
    836 sub collect_specific {
    837     my $self = shift (@_);
    838 }
    839 
    840 sub make_auxiliary_files {
    841     my $self = shift (@_);
    842     my ($index);
    843     my $build_cfg = {};
    844     my $outhandle = $self->{'outhandle'};
    845 
    846     print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    847     print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};
    848 
    849     # get the text directory
    850     &util::mk_all_dir ($self->{'build_dir'});
    851 
    852     # store the build date
    853     $build_cfg->{'builddate'} = time;
    854     $build_cfg->{'indexstem'} = $self->{'collection'};
    855     # store the number of documents and number of bytes
    856     $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    857     $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();
    858 
    859492    # get additional stats from mg
    860493    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
     
    875508    close PIPEIN;
    876509    }
    877 
    878     # store the mapping between the index names and the directory names
    879     # the index map is used to determine what indexes there are, so any that are not built should not be put into the map.
    880     my @indexmap = ();
    881     foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
    882     if (not defined ($self->{'notbuilt'}->{$index})) {
    883         push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    884     }
    885     }
    886     $build_cfg->{'indexmap'} = \@indexmap;
    887 
    888     my @subcollectionmap = ();
    889     foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
    890     push (@subcollectionmap, "$subcollection\-\>" .
    891           $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    892     }
    893     $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);
    894 
    895     my @languagemap = ();
    896     foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
    897     push (@languagemap, "$language\-\>" .
    898           $self->{'index_mapping'}->{'languagemap'}->{$language});
    899     }
    900     $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);
    901 
    902     #$build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};
    903     my @notbuilt = ();
    904     foreach my $nb (keys %{$self->{'notbuilt'}}) {
    905     push (@notbuilt, $nb);
    906     }
    907     $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);
    908     $build_cfg->{'maxnumeric'} = 4;
    909     if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
    910     $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
    911     $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    912     }
    913 
    914     # write out the build information
    915     &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
    916                  '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$',
    917                              '^(indexmap|subcollectionmap|languagemap|notbuilt)$');
    918 
    919     print STDERR "</Stage>\n" if $self->{'gli'};
    920 }
    921 
    922 sub deinit {
    923     my $self = shift (@_);
    924 
    925     &plugin::deinit($self->{'pluginfo'},$self->{'buildproc'});
    926 }
    927 
    928 sub print_stats {
    929     my $self = shift (@_);
    930 
    931     my $outhandle = $self->{'outhandle'};
    932     my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    933     my $index = $self->{'buildproc'}->get_index();
    934     my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    935     my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();
    936    
    937     if ($indexing_text) {
    938     print $outhandle "Stats (Creating index $index)\n";
    939     } else {
    940     print $outhandle "Stats (Compressing text from $index)\n";
    941     }
    942     print $outhandle "Total bytes in collection: $num_bytes\n";
    943     print $outhandle "Total bytes in $index: $num_processed_bytes\n";
    944    
    945     if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
    946     print $outhandle "***************\n";
    947     if ($indexing_text) {
    948         print $outhandle "WARNING: There is very little or no text to process for $index\n";
    949     } elsif (!$self->{'no_text'}) {
    950         print $outhandle "WARNING: There is very little or no text to compress\n";
    951     }
    952     print $outhandle "         Was this your intention?\n";
    953     print $outhandle "***************\n";
    954     print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
    955     }
    956510}
    957511
  • trunk/gsdl/perllib/mgppbuilder.pm

    r10158 r10468  
    2626package mgppbuilder;
    2727
     28use basebuilder;
    2829use classify;
    2930use cfgread;
     
    3334use FileHandle;
    3435
    35 
    36 BEGIN {
    37     # set autoflush on for STDERR and STDOUT so that mgpp
    38     # doesn't get out of sync with plugins
    39     STDOUT->autoflush(1);
    40     STDERR->autoflush(1);
    41 }
    42 
    43 END {
    44     STDOUT->autoflush(0);
    45     STDERR->autoflush(0);
    46 }
    47 
    48 our $maxdocsize = 12000;
     36sub BEGIN {
     37    @mgppbuilder::ISA = ('basebuilder');
     38}
     39
     40
    4941
    5042our %level_map = ('document'=>'Doc',
     
    10799              'Para'=>1);
    108100
     101my $maxdocsize = $basebuilder::maxdocsize;
     102
    109103sub new {
    110104    my $class = shift(@_);
     
    114108    $outhandle, $no_text, $failhandle, $gli) = @_;
    115109
    116     $outhandle = STDERR unless defined $outhandle;
    117     $no_text = 0 unless defined $no_text;
    118    
    119     # create an mgppbuilder object
    120     my $self = bless {'collection'=>$collection,
    121               'source_dir'=>$source_dir,
    122               'build_dir'=>$build_dir,
    123               'verbosity'=>$verbosity,
    124               'maxdocs'=>$maxdocs,
    125               'debug'=>$debug,
    126               'keepold'=>$keepold,
    127               'remove_empty_classifications'=>$remove_empty_classifications,
    128               'outhandle'=>$outhandle,
    129               'no_text'=>$no_text,
    130               'notbuilt'=>{},    # indexes not built
    131               'indexfieldmap'=>\%static_indexfield_map,
    132               'gli'=>$gli
    133           }, $class;
    134 
    135     $self->{'gli'} = 0 unless defined $self->{'gli'};
    136 
    137     # read in the collection configuration file
    138     my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    139     if (!-e $colcfgname) {
    140     die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    141     }
    142     $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    143 
    144     # sort out the indexes
    145     #indexes are specified with spaces, but we put them into one index
    146     my $indexes = $self->{'collect_cfg'}->{'indexes'};
    147     $self->{'collect_cfg'}->{'indexes'} = [];
    148     push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));
    149      
    150    
    151     # sort out subcollection indexes
    152     if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
    153     my $indexes = $self->{'collect_cfg'}->{'indexes'};
    154     $self->{'collect_cfg'}->{'indexes'} = [];
    155     foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
    156         foreach my $index (@$indexes) {
    157         push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
    158         }
    159     }
    160     }
    161 
    162     # sort out language subindexes
    163     if (defined $self->{'collect_cfg'}->{'languages'}) {
    164     my $indexes = $self->{'collect_cfg'}->{'indexes'};
    165     $self->{'collect_cfg'}->{'indexes'} = [];
    166     foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
    167         foreach my $index (@$indexes) {
    168         if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
    169             push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
    170         }
    171         else { # add in an empty subcollection field
    172             push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
    173            
    174         }       
    175         }
    176     }
    177     }
    178 
    179     # make sure that the same index isn't specified more than once
    180     my %tmphash = ();
    181     my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    182     $self->{'collect_cfg'}->{'indexes'} = [];
    183     foreach my $i (@tmparray) {
    184     if (!defined ($tmphash{$i})) {
    185         push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
    186         $tmphash{$i} = 1;
    187     }
    188     }
    189 
     110    my $self = new basebuilder (@_);
     111    $self = bless $self, $class;
     112
     113    $self->{'indexfieldmap'} = \%static_indexfield_map;
    190114
    191115    # get the levels (Section, Paragraph) for indexing and compression
     
    212136    }
    213137
    214     print $outhandle "doclevel = ". $self->{'doc_level'}."\n";
    215     # get the list of plugins for this collection
    216 
    217     #build up the extra global options for the plugins
    218     my @global_opts = ();
    219     if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
    220     push @global_opts, "-separate_cjk";
    221     }
    222 
    223     my $plugins = [];
    224     if (defined $self->{'collect_cfg'}->{'plugin'}) {
    225     $plugins = $self->{'collect_cfg'}->{'plugin'};
    226     }
    227    
    228     # load all the plugins
    229     $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    230     if (scalar(@{$self->{'pluginfo'}}) == 0) {
    231     print $outhandle "No plugins were loaded.\n";
    232     die "\n";
    233     }
    234 
    235     # get the list of classifiers for this collection
    236     my $classifiers = [];
    237     if (defined $self->{'collect_cfg'}->{'classify'}) {
    238     $classifiers = $self->{'collect_cfg'}->{'classify'};
    239     }
    240    
    241     # load all the classifiers
    242     $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);
    243 
    244     # load up any dontgdbm fields
    245     $self->{'dontgdbm'} = {};
    246     if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
    247     foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
    248         $self->{'dontgdbm'}->{$dg} = 1;
    249     }
    250     }
    251 
    252     # load up the document processor for building
    253     # if a buildproc class has been created for this collection, use it
    254     # otherwise, use the mgpp buildproc
    255     my ($buildprocdir, $buildproctype);
    256     if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
    257     $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
    258     $buildproctype = "${collection}buildproc";
    259     } else {
    260     $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    261     $buildproctype = "mgppbuildproc";
    262     }
    263     require "$buildprocdir/$buildproctype.pm";
    264 
    265     eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
    266      "\$source_dir, \$build_dir, \$keepold, \$verbosity, \$outhandle)");
    267     die "$@" if $@;
    268 
    269138    $self->{'buildtype'} = "mgpp";
    270139
     
    272141}
    273142
    274 sub init {
    275     my $self = shift (@_);
    276 
    277     if (!$self->{'debug'} && !$self->{'keepold'}) {
    278     # remove any old builds
    279     &util::rm_r($self->{'build_dir'});
    280     &util::mk_all_dir($self->{'build_dir'});
    281        
    282     # make the text directory
    283     my $textdir = "$self->{'build_dir'}/text";
    284     &util::mk_all_dir($textdir);
    285     }
    286 }
    287 
    # set_strip_html (old side of the diff): stores the $strip flag on the
    # builder and forwards the same value to the buildproc object.
    288 sub set_strip_html {
    289     my $self = shift (@_);
    290     my ($strip) = @_;
    291    
    292     $self->{'strip_html'} = $strip;
    293     $self->{'buildproc'}->set_strip_html($strip);
    # generate_index_list (new side of the diff): replaces the configured
    # 'indexes' list with a one-element list holding all of the original
    # entries joined by commas.
    # NOTE(review): @$indexes assumes collect_cfg->{'indexes'} is an array
    # ref at this point -- confirm it is always defined before this runs.
     143sub generate_index_list {
     144    my $self  = shift (@_);
     145   
     146    # sort out the indexes
     147    #indexes are specified with spaces, but we put them into one index
     148    my $indexes = $self->{'collect_cfg'}->{'indexes'};
     149    $self->{'collect_cfg'}->{'indexes'} = [];
     150    push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));
     151}
     152
    # default_buildproc (new side of the diff): returns the fixed name
    # "mgppbuildproc". Presumably the base builder uses it when a collection
    # has no buildproc of its own -- confirm in basebuilder.
    # The closing brace below is the diff's shared context line.
     153sub default_buildproc {
     154    my $self  = shift (@_);
     155
     156    return "mgppbuildproc";
    294157}
    295158
     
    423286}
    424287
    # want_built: decides whether the index named $index should be built.
    # Returns 0, and records the index in $self->{'notbuilt'}, when $index
    # matches any entry of collect_cfg->{'dontbuild'}; returns 1 otherwise,
    # including when no 'dontbuild' list is configured.
    425 sub want_built {
    426     my $self = shift (@_);
    427     my ($index) = @_;
    428 
    429     if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
    430     foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
    # each dontbuild entry is interpolated as a regex anchored at both ends
    431         if ($index =~ /^$checkstr$/) {
    432         #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
    433         $self->{'notbuilt'}->{$index} = 1;
    434         return 0;
    435         }
    436     }
    437     }
    438 
    439     return 1;
    440 }
    441 
    # build_indexes (old side of the diff): builds either the single index
    # named by $indexname (when it contains a word character) or every index
    # listed in collect_cfg->{'indexes'}. It first stores the result of
    # create_index_mapping in $self->{'index_mapping'}, then calls build_index
    # for each index that want_built accepts, logging to the out handle when
    # verbosity >= 1 and emitting a <Stage> tag on STDERR when 'gli' is set.
    442 sub build_indexes {
    443     my $self = shift (@_);
    444     my ($indexname) = @_;
    445     my $outhandle = $self->{'outhandle'};
    446 
    447     my $indexes = [];
    448     if (defined $indexname && $indexname =~ /\w/) {
    449     push @$indexes, $indexname;
    450     } else {
    451     $indexes = $self->{'collect_cfg'}->{'indexes'};
    452     }
    453 
    454     # create the mapping between the index descriptions
    455     # and their directory names (includes subcolls and langs)
    456     $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
    457 
    458     # build each of the indexes
    459     foreach my $index (@$indexes) {
    460     if ($self->want_built($index)) {
    461         print $outhandle "\n*** building index $index in subdirectory " .
    462         "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
    463         print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
    464         $self->build_index($index);
    465     } else {
    466         print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
    467     }
    468     }
    469 
     288
    # build_indexes_extra (new side of the diff): keeps only the final step of
    # the old build_indexes, the call to make_final_field_list().
    # Presumably invoked by the base class after its own index loop -- confirm
    # in basebuilder.
     289sub build_indexes_extra {
     290    my $self = shift(@_);
    470291    #define the final field lists
    471292    $self->make_final_field_list();
    472 
    473 }
     293}   
    474294
    475295# creates directory names for each of the index descriptions
     
    545365}
    546366
    # Returns a short code derived from an index field specification.
    #
    # $field is a comma-separated list of field names:
    #   - undefined, or containing no word character: returns "".
    #   - two or more components: the first character of each of the
    #     first two components.
    #   - a single component: its first character followed by the next
    #     consonant; when no later consonant exists, its first two
    #     characters; when only one character is available, that character.
    sub process_field {
        my $self = shift (@_);
        my ($field) = @_;

        return "" unless (defined ($field) && $field =~ /\w/);

        my @components = split /,/, $field;
        if (scalar @components >= 2) {
            # only the first two components contribute to the code
            splice (@components, 2);
            # explicit loop rather than map in void context; $part aliases
            # the array element, so the substitution edits it in place
            foreach my $part (@components) {
                $part =~ s/^(.).*$/$1/;
            }
            return join("", @components);
        }

        # named $first/$second because $a/$b are the package variables
        # reserved for sort comparators
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            # (.?) lets a one-character field yield that character instead
            # of leaving both captures undefined
            ($first, $second) = $field =~ /^(.)(.?)/;
        }
        $first = "" unless defined $first;
        $second = "" unless defined $second;
        return $first . $second;
    }
    569 
    570367sub make_unique {
    571368    my $self = shift (@_);
     
    583380}   
    584381
    # get_next_version: advances, in place, the trailing version suffix of the
    # string that $nameref refers to.
    #   - ends in two digits: that pair is incremented
    #   - ends in one digit:  the digit is incremented, with 9 becoming 10
    #   - otherwise:          the last character is replaced by 0
    585 sub get_next_version {
    586     my $self = shift (@_);
    587     my ($nameref) = @_;
    588     my $num=0;
    589     if ($$nameref =~ /(\d\d)$/) {
    590     $num = $1; $num ++;
    591     $$nameref =~ s/\d\d$/$num/;
    592     } elsif ($$nameref =~ /(\d)$/) {
    593     $num = $1;
    594     if ($num == 9) {$$nameref =~ s/\d$/10/;}
    595     else {$num ++; $$nameref =~ s/\d$/$num/;}
    596     } else {
    # NOTE(review): on an empty string this substitution matches nothing and
    # the name is left unchanged -- confirm callers never pass one.
    597     $$nameref =~ s/.$/0/;
    598     }
    599 }
    600382
    601383sub build_index {
     
    812594}   
    813595
    # make_infodatabase (old side of the diff): creates the collection's info
    # database. It makes the "text" and "assoc" directories under build_dir,
    # works out the database filename (".ldb" on little-endian hosts, ".bdb"
    # otherwise), opens a pipe to txt2db (or uses STDOUT in debug mode),
    # configures the buildproc for 'infodb' mode, then writes the collection
    # metadata, the plugin output, the classifier info and a [browselist]
    # entry to that handle.
    # Dies if the txt2db executable is missing or the pipe cannot be opened.
    814 sub make_infodatabase {
    815     my $self = shift (@_);
    816     my $outhandle = $self->{'outhandle'};
    817 
    818 
    819     my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    820     my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    821     &util::mk_all_dir ($textdir);
    822     &util::mk_all_dir ($assocdir);
    823 
    824     # get db name
    825     my $dbext = ".bdb";
    826     $dbext = ".ldb" if &util::is_little_endian();
    827     my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    828     $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
    829 
    830     my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    831     my $exe = &util::get_os_exe ();
    832     my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
    833 
    834     # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    835     if (!defined $self->{'build_cfg'}) {
    836     $self->read_final_field_list();
    837     }
    838     print $outhandle "\n*** creating the info database and processing associated files\n"
    839     if ($self->{'verbosity'} >= 1);
    840     print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
    841 
    842     # init all the classifiers
    843     &classify::init_classifiers ($self->{'classifiers'});
    844 
    845     # set up the document processor
    846     my ($handle);
    847     if ($self->{'debug'}) {
    848     $handle = STDOUT;
    849     } else {
    # NOTE(review): the existence test uses the full path in $txt2db_exe, but
    # the pipe command runs "txt2db$exe" by bare name -- presumably resolved
    # through PATH; confirm the two always refer to the same binary.
    850     if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
    851         print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
    852         die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
    853     }
    854     $handle = mgppbuilder::PIPEOUT;
    855     }
    856 
    857     $self->{'buildproc'}->set_output_handle ($handle);
    858     $self->{'buildproc'}->set_mode ('infodb');
    859     $self->{'buildproc'}->set_assocdir ($assocdir);
    860     $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    861     $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    862     $self->{'buildproc'}->set_indexing_text (0);
    863     $self->{'buildproc'}->set_store_text(1);
    864     #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    865 
    866     # make_infodatabase does not support incremental build
    867     # => full reset needed
    868     $self->{'buildproc'}->zero_reset();
     596
    # output_collection_meta (new side of the diff): takes the already-open
    # output handle as its argument and keeps the collection-metadata part of
    # the old make_infodatabase. $metadata_entry is now declared once near the
    # top instead of separately inside each branch. Parts of the body are
    # elided by the diff view.
     597sub output_collection_meta {
     598    my $self = shift(@_);
     599    my ($handle) = @_;
    869600
    870601    # do the collection info
     
    873604    # first do the collection meta stuff - everything without a dot
    874605    my $collmetadefined = 0;
     606    my $metadata_entry;
    875607    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
    876608    $collmetadefined = 1;
    877609    foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
    878610        next if ($cmeta =~ /^\./); # for now, ignore ones with dots
    879         my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
     611        $metadata_entry = $self->create_language_db_map($cmeta, $cmeta);
    880612        #write the entry to the file
    881613        print $handle $metadata_entry;
     
    896628    $collmeta = ".$longfield";
    897629    if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
    898         my $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
     630        $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
    899631        $field_entry .= $metadata_entry;
    900632    } else { #use the metadata names, or the text macros for allfields and textonly
     
    917649    my $levelid = $level_map{$level}; # find the actual value we used in the index
    918650    if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
    919         my $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
     651        $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
    920652        $level_entry .= $metadata_entry;
    921653    } else {
     
    955687    # end the collection entry
    956688    print $handle "\n" . ('-' x 70) . "\n";       
    957    
    # Old side only: run the plugins over source_dir with the buildproc, then
    # write the classifier information and the [browselist] entry.
    958     &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
    959            "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    960 
    961     # output classification information
    962     &classify::output_classify_info ($self->{'classifiers'}, $handle,
    963                      $self->{'remove_empty_classifications'},
    964                      $self->{'gli'});
    965 
    966     #output doclist
    967     my @doclist = $self->{'buildproc'}->get_doc_list();
    968     my $docs = join (";",@doclist);
    969     print $handle "[browselist]\n";
    970     print $handle "<hastxt>0\n";
    971     print $handle "<childtype>VList\n";
    972     print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    973     print $handle "<thistype>Invisible\n";
    974     print $handle "<contains>$docs";
    975     print $handle "\n" . ('-' x 70) . "\n";
    # the handle is only closed when it is the txt2db pipe, not STDOUT
    976     close ($handle) if !$self->{'debug'};
    977 
    978     print STDERR "</Stage>\n" if $self->{'gli'};
    979 }
    980 
     689
     690
     691}
    981692sub create_language_db_map {
    982693    my $self = shift (@_);
     
    1020731   
    1021732}
    # collect_specific (old side of the diff): an empty sub that only unpacks
    # $self. Presumably a hook for collection-specific builders to override
    # -- confirm.
    1022 sub collect_specific {
    1023     my $self = shift (@_);
    1024 }
    1025733
    1026734# at the end of building, we have an indexfieldmap with all the mappings,
     
    1047755    # those again.
    1048756
    1049     foreach my $field (@{$self->{'collect_cfg'}->{'indexes'}}) {
     757    my $field;
     758    foreach $field (@{$self->{'collect_cfg'}->{'indexes'}}) {
    1050759    # remove subcoll stuff
    1051760    my $parts = $field;
     
    1061770   
    1062771    #add all fields bit
    1063     foreach my $field (@specifiedfieldorder) {
     772    foreach $field (@specifiedfieldorder) {
    1064773    if ($field eq "metadata") {
    1065774        foreach my $newfield (keys %{$self->{'buildproc'}->{'indexfields'}}) {
     
    1114823
    1115824    my $buildcfg = &colcfg::read_build_cfg( $buildconfigfile);
    1116 
     825    my $field;
    1117826    if (defined $buildcfg->{'indexfields'}) {
    1118     foreach my $field (@{$buildcfg->{'indexfields'}}) {
     827    foreach $field (@{$buildcfg->{'indexfields'}}) {
    1119828        push (@indexfields, "$field");
    1120829    }
     
    1122831
    1123832    if (defined $buildcfg->{'indexfieldmap'}) {
    1124     foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
     833    foreach $field (@{$buildcfg->{'indexfieldmap'}}) {
    1125834        push (@indexfieldmap, "$field");
    1126835        my ($f, $v) = $field =~ /^(.*)\-\>(.*)$/;
     
    1133842}
    1134843
    1135 sub make_auxiliary_files {
     844
     # write_cfg_file (new side of the diff): writes the $build_cfg hash to
     # "<build_dir>/build.cfg" through cfgread::write_cfg_file.
     # The two patterns name the config keys handed over as the third and
     # fourth arguments; presumably scalar-valued and list-valued keys
     # respectively -- confirm in cfgread.
     # The first pattern here includes 'maxnumeric'.
     845sub write_cfg_file {
     846    my $self = shift(@_);
     847    my ($build_cfg) = @_;
     848
     849    # write out the build information
     850    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
     851                 '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem|maxnumeric)$',
     852                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$');
     853
     854}
     855
     # build_cfg_extra (new side of the diff): receives the $build_cfg hash as
     # an argument, stores the buildproc's section count in it, and keeps the
     # level-info section of the old make_auxiliary_files. Its closing brace
     # is a shared context line further down the diff, outside this span.
     #
     # The single-numbered lines below are the old make_auxiliary_files body:
     # it filled in the build date, build type, index stem, document/section/
     # byte counts and the index, subcollection, language and notbuilt maps,
     # then wrote build.cfg itself.
     856sub build_cfg_extra {
    1136857    my $self = shift (@_);
    1137     my ($index);
    1138    
    1139     my $build_cfg = {};
    1140     # this already includes indexfieldmap and indexfields
    1141     if (defined $self->{'build_cfg'}) {
    1142     $build_cfg = $self->{'build_cfg'};
    1143     }
    1144    
    1145     my $outhandle =  $self->{'outhandle'};
    1146     print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    1147     print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};
    1148 
    1149     # get the text directory
    1150     &util::mk_all_dir ($self->{'build_dir'});
    1151 
    1152     # store the build date
    1153     $build_cfg->{'builddate'} = time;
    1154     $build_cfg->{'buildtype'} = $self->{'buildtype'};
    1155     $build_cfg->{'indexstem'} = $self->{'collection'};
     858    my ($build_cfg) = @_;
     859
     860    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
     861   
    1156862    # store the level info
    1157863    my @indexlevels = ();
     
    1169875    $build_cfg->{'textlevel'} = $level_map{'document'};
    1170876    }
    1171     # store the number of documents and number of bytes
    1172     $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    1173     $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    1174     $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();
    1175 
    1176     # store the mapping between the index names and the directory names
    1177     my @indexmap = ();
    1178     foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
    # indexes recorded in 'notbuilt' are left out of the index map
    1179     if (not defined ($self->{'notbuilt'}->{$index})) {
    1180         push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    1181     }
    1182     }
    1183     $build_cfg->{'indexmap'} = \@indexmap;
    1184 
    # the subcollection, language and notbuilt entries are only stored when
    # their lists are non-empty
    1185     my @subcollectionmap = ();
    1186     foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
    1187     push (@subcollectionmap, "$subcollection\-\>" .
    1188           $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    1189     }
    1190     $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);
    1191 
    1192     my @languagemap = ();
    1193     foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
    1194     push (@languagemap, "$language\-\>" .
    1195           $self->{'index_mapping'}->{'languagemap'}->{$language});
    1196     }
    1197     $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);
    1198 
    1199     my @notbuilt = ();
    1200     foreach my $nb (keys %{$self->{'notbuilt'}}) {
    1201     push (@notbuilt, $nb);
    1202     }
    1203     $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);
    1204 
    1205     # write out the build information
    1206     &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
    1207                  '^(builddate|buildtype|numdocs|numsections|numbytes|textlevel|indexstem)$',
    1208                              '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels|levelmap)$');
    1209 
    1210     print STDERR "</Stage>\n" if $self->{'gli'};
    1211 }
    1212 
    # deinit (old side of the diff): passes the builder's plugin info and
    # buildproc to plugin::deinit.
    1213 sub deinit {
    1214     my $self = shift (@_);
    1215    
    1216     &plugin::deinit($self->{'pluginfo'},$self->{'buildproc'});
    1217 }
    1218 
    # print_stats (old side of the diff): prints byte counts for the current
    # index or text-compression pass to the out handle, using values read
    # from the buildproc.
    # When fewer than 50 bytes were processed (and the pass is either indexing
    # or text is not disabled via 'no_text'), it adds either a "no additional
    # text" note (keepold mode, exactly 0 bytes) or a boxed warning asking
    # whether so little text was intended.
    # The final closing brace is the diff's shared context line.
    1219 sub print_stats {
    1220     my $self = shift (@_);
    1221 
    1222     my $outhandle = $self->{'outhandle'};
    1223     my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    1224     my $index = $self->{'buildproc'}->get_index();
    1225     my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    1226     my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();
    1227 
    1228     if ($indexing_text) {
    1229     print $outhandle "Stats (Creating index $index)\n";
    1230     } else {
    1231     print $outhandle "Stats (Compressing text from $index)\n";
    1232     }
    1233     print $outhandle "Total bytes in collection: $num_bytes\n";
    1234     print $outhandle "Total bytes in $index: $num_processed_bytes\n";
    1235 
    1236     if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
    1237    
    # in keepold mode only an exact zero is reported; 1-49 bytes prints nothing
    1238     if ($self->{'keepold'}) {
    1239         if ($num_processed_bytes == 0) {
    1240         if ($indexing_text) {
    1241             print $outhandle "No additional text was added to $index\n";
    1242         } elsif (!$self->{'no_text'}) {
    1243             print $outhandle "No additional text was compressed\n";
    1244         }   
    1245         }   
    1246     }
    1247     else {
    1248         print $outhandle "***************\n";
    1249         if ($indexing_text) {
    1250         print $outhandle "WARNING: There is very little or no text to process for $index\n";
    1251         } elsif (!$self->{'no_text'}) {
    1252         print $outhandle "WARNING: There is very little or no text to compress\n";
    1253         }     
    1254         print $outhandle "         Was this your intention?\n";
    1255         print $outhandle "***************\n";
    1256     }
    1257 
    1258     }
    1259 
     877   
    1260878}
    1261879
Note: See TracChangeset for help on using the changeset viewer.