########################################################################### # # mgbuilder.pm -- MGBuilder object # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package mgbuilder;

# mgbuilder drives the MG tool chain (mg_passes, mg_compression_dict,
# mg_perf_hash_build, mg_weights_build, mg_invf_dict, mg_stem_idx, txt2db
# and mgstat) to compress the text of a collection, build its indexes,
# create the info database and write build.cfg.

use strict;
# Filehandles are passed around by name in this module (e.g. 'STDERR',
# 'mgbuilder::PIPEOUT'), which needs symbolic references.
no strict 'refs';

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mg
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# value handed to mg_passes via -b (maximum document size)
our $maxdocsize = 12000;

# suffixes of the files kept in an index directory once the index is
# built; build_index deletes every other file that has a suffix
our %wanted_index_files = ('td'=>1,
                           't'=>1,
                           'idb'=>1,
                           'ib1'=>1,
                           'ib2'=>1,
                           'ib3'=>1,
                           'i'=>1,
                           'ip'=>1,
                           'tiw'=>1,
                           'wa'=>1);


# Creates a new mgbuilder object.
#
# Arguments: class, collection name, source directory, build directory,
# verbosity, maxdocs, debug flag, keepold flag, allclassifications flag
# and an optional output handle (defaults to STDERR).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list by
# subcollection and language, loads the plugins and classifiers and
# creates the build document processor.  Dies if collect.cfg is missing,
# if no plugins could be loaded, or if the document processor cannot be
# constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = 'STDERR' unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes, then language subindexes: every
    # existing index description gets ":<qualifier>" appended for each
    # configured qualifier.
    # NOTE(review): when only 'languages' is configured the language ends
    # up as the third ':'-separated field, which create_index_mapping
    # reads as the subcollection - confirm this is intended.
    foreach my $qualifier ('indexsubcollections', 'languages') {
        next unless defined $cfg->{$qualifier};

        my $indexes = $cfg->{'indexes'} || [];
        my @expanded = ();
        foreach my $suffix (@{$cfg->{$qualifier}}) {
            foreach my $index (@$indexes) {
                push (@expanded, "$index:$suffix");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'} || []}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # an explicit class-method call avoids both a string eval and the
    # ambiguous indirect-object form "new CLASS(...)"
    $self->{'buildproc'} = eval {
        $buildproctype->new ($collection, $source_dir, $build_dir,
                             $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}

# Prepares the build directory.  Unless the debug or keepold flag is set,
# any old build is removed and the build directory and its "text"
# subdirectory are recreated.
sub init {
    my $self = shift (@_);

    return if ($self->{'debug'} || $self->{'keepold'});

    # remove any old builds
    &util::rm_r($self->{'build_dir'});
    &util::mk_all_dir($self->{'build_dir'});

    # make the text directory
    my $textdir = "$self->{'build_dir'}/text";
    &util::mk_all_dir($textdir);
}

# Compresses the text of the collection.
#
# Argument: the index description whose text is to be compressed.
#
# Runs the plugins twice through the document processor: a first pass
# piped into "mg_passes -T1" to collect statistics, then (after
# mg_compression_dict has built the dictionary) a second pass piped into
# "mg_passes -T2".  In debug mode the output goes to STDOUT and no
# external program is run.  Dies if a required executable is missing or
# a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        # $handle still names PIPEOUT here, so the document processor's
        # output handle is reopened onto the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}

# Returns 0 if the index description matches one of the "dontbuild"
# patterns in collect.cfg (recording its directory name in 'notbuilt'),
# 1 otherwise.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            # each dontbuild entry is applied as an anchored pattern
            if ($index =~ /^$checkstr$/) {
                push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
                return 0;
            }
        }
    }

    return 1;
}

# Builds the indexes of the collection.
#
# Argument: an optional index description; when it contains a word
# character only that index is built, otherwise every index listed in
# collect.cfg is considered.  Indexes rejected by want_built are skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}

# creates directory names for each of the index descriptions
#
# Argument: reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash holding
#   <index description>   => directory name
#   'indexmap'            => { "level:fields" => processed name }
#   'subcollectionmap'    => { subcollection  => processed name }
#   'languagemap'         => { languages      => processed name }
#   '*maporder'           => the keys of each map in order of first use
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original value, kept separately per category so
    # make_unique can tell which component caused a collision
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    my %seen = ();
    foreach my $index (@$indexes) {
        # a repeated description already has its directory; trying to
        # make it "unique" against itself would never terminate
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# An undefined field, or one without any word character, gives "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep only the first two components, each cut to its first character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # no consonant after the first character: use the first two characters
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
    return "$first$second";
}

# Resolves a directory name collision for an index description.
#
# Arguments: reference to the hash of already used processed names (keys
# 'index', 'subcollection' and 'languages'), the index description, and
# references to its processed index, subcollection and language names.
#
# Advances the first processed name that is already taken by a different
# original value (checked in the order index, subcollection, language)
# and returns the concatenation of the three names.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    }
    return "$$indexref$$subref$$langref";
}

# Advances the name behind the given scalar reference to its next
# version, in place: a name without a trailing digit has its last
# character replaced by 0, a trailing number is incremented
# (... 8, 9, 10, 11 ...).
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one trailing digit exists here, so that single digit
            # is what has to be replaced
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds a single index.
#
# Argument: the index description; its directory name is looked up in
# $self->{'index_mapping'}.
#
# Runs the plugins twice through the document processor (mg_passes -N1,
# then mg_perf_hash_build and mg_passes -N2), then builds the weights
# file, the on-disk stemmed dictionary and the stem indexes, and finally
# removes files whose suffix is not in %wanted_index_files.  In debug
# mode the output goes to STDOUT and no external program is run.  Dies
# if a required executable is missing, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    foreach my $cfglanguage (@{$self->{'collect_cfg'}->{'languages'} || []}) {
        # work on a copy: stripping the "!" from the configuration entry
        # itself would lose the negation for every index built afterwards
        my $language = $cfglanguage;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        # $handle still names PIPEOUT, so the document processor's output
        # handle is reopened onto the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    return if $self->{'debug'};

    close ($handle);

    # create the weights file
    print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_weights_build_exe") {
        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
    }
    system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_invf_dict_exe") {
        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
    }
    system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");

    # creates stem index files for the various stemming methods
    print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_stem_idx_exe") {
        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
    }
    foreach my $stemmethod (1, 2, 3) {
        system ("$mg_stem_idx_exe -b 4096 -s$stemmethod -f $fullindexprefix $osextra");
    }

    # remove unwanted files
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
    }
    closedir (DIR);
}

# Creates the info database and processes associated files.
#
# Pipes the collection metadata from collect.cfg, the output of the
# document processor in 'infodb' mode and the classification information
# into txt2db (or to STDOUT in debug mode).  Dies if txt2db is missing or
# the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            my $value = $collectionmeta->{$key};
            my $cmeta = $key;
            # a leading "." marks metadata attached to an index; it is
            # written under the directory name of that index
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                        $value . "\n";
                } else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                }
            } else {
                print $handle "<$cmeta>$value\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}

# Hook for collection specific processing; does nothing here.
sub collect_specific {
    my $self = shift (@_);
}

# Writes build.cfg into the build directory: build date, document and
# byte counts from the document processor, word and section counts read
# from mgstat when it can be run, the index, subcollection and language
# maps, and the list of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" ||
        !open (PIPEIN, "$mgstat_exe -d $self->{'build_dir'} -f $input_file |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'} || []}) {
        push (@indexmap, $index . '->' . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'} || []}) {
        push (@subcollectionmap, $subcollection . '->' .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'} || []}) {
        push (@languagemap, $language . '->' .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');
}

# Hook called when building is finished; does nothing here.
sub deinit {
    my $self = shift (@_);
}

# Prints the byte counts reported by the document processor for the pass
# that has just finished, with a warning when fewer than 50 bytes were
# processed.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    my $index = $self->{'buildproc'}->get_index();
    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    } else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        if ($indexing_text) {
            print $outhandle "This may cause an error while attempting to build the index\n";
        } else {
            print $outhandle "This may cause an error while attempting to compress the text\n";
        }
        print $outhandle "***************\n";
    }
}

1;