########################################################################### # # mgbuilder.pm -- MGBuilder object # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package mgbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;

# NOTE: this module stores bareword filehandle names (STDERR, STDOUT,
# mgbuilder::PIPEOUT) in scalars and prints to / opens them by name, so it
# cannot be run under "use strict" without further changes.
#
# NOTE(review): the GLI progress lines ('print STDERR "\n" if $self->{'gli'}')
# emit only newlines as written here. If they were meant to carry markup,
# it is not visible in this file - confirm against the upstream source.

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mg
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# value passed to mg_passes with -b (maximum document size)
my $maxdocsize = 12000;

# suffixes of the files that are kept in an index directory once the
# index has been built; files with any other suffix are deleted
my %wanted_index_files = ('td'=>1,
                          't'=>1,
                          'idb'=>1,
                          'ib1'=>1,
                          'ib2'=>1,
                          'ib3'=>1,
                          'i'=>1,
                          'ip'=>1,
                          'tiw'=>1,
                          'wa'=>1);

# Constructor.
#
# Arguments (positional): collection name, source directory, build
# directory, verbosity, maxdocs, debug flag, keepold flag,
# remove_empty_classifications flag, output handle (defaults to STDERR),
# no_text flag (defaults to 0), fail handle (defaults to STDERR) and
# gli flag (defaults to 0).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list with the
# configured subcollections and languages, loads the plugins and
# classifiers and creates the build document processor.
#
# Dies if collect.cfg is missing, if no plugins could be loaded or if the
# buildproc class cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},    # indexes not built
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    # from here on 'indexes' is always an array reference
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                } else {
                    # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (keeps the first occurrence and preserves the order)
    my %seen = ();
    my @all_indexes = @{$cfg->{'indexes'}};
    $cfg->{'indexes'} = [];
    foreach my $i (@all_indexes) {
        if (!defined ($seen{$i})) {
            push (@{$cfg->{'indexes'}}, $i);
            $seen{$i} = 1;
        }
    }

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle,
                                                 $failhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building

    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }

    require "$buildprocdir/$buildproctype.pm";

    # call the constructor through the class name held in $buildproctype;
    # this needs no string eval and any exception propagates to the caller
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir, $build_dir,
                                                $verbosity, $outhandle);

    return $self;
}

# Prepares the build directory. Unless the builder is in debug or keepold
# mode, any old build is removed and the build directory and its "text"
# subdirectory are (re)created.
sub init {
    my $self = shift (@_);

    return if ($self->{'debug'} || $self->{'keepold'});

    # remove any old builds
    &util::rm_r($self->{'build_dir'});
    &util::mk_all_dir($self->{'build_dir'});

    # make the text directory
    my $textdir = "$self->{'build_dir'}/text";
    &util::mk_all_dir($textdir);
}

# Creates the compressed text for the collection.
#
# $textindex is handed to the document processor via set_index().
# The plugins are run twice: once piped into "mg_passes -T1" to collect
# text statistics and, after mg_compression_dict has been run, once piped
# into "mg_passes -T2" to compress the text. In debug mode no external
# programs are run and the processor output goes to STDOUT instead.
#
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # -M value for mg_passes; collect.cfg may override the default of 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the existence check uses the full path, the command itself is
        # found through the shell's search path
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        # $handle still holds the name of the PIPEOUT handle, so this
        # reopens the handle the document processor was given above
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    } else {
        print STDERR "\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "\n" if $self->{'gli'};
}

# Returns 1 if $index should be built and 0 if it matches one of the
# 'dontbuild' entries of collect.cfg. Each entry is used as an anchored
# regular expression. An index that is not wanted is also recorded in
# $self->{'notbuilt'}.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $checkstr (@$dontbuild) {
        if ($index =~ /^$checkstr$/) {
            #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
            $self->{'notbuilt'}->{$index} = 1;
            return 0;
        }
    }

    return 1;
}

# Builds the index named by $indexname, or every index listed in
# collect.cfg when $indexname is undefined or contains no word character.
# Also (re)creates $self->{'index_mapping'} for the indexes being built.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "\n" if $self->{'gli'};
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}

# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]". Returns a hash reference
# that maps each description to its directory name and that also holds
# the 'indexmap', 'subcollectionmap' and 'languagemap' tables together
# with their '...maporder' lists.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so later collisions can be resolved
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$level:$gran";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# Returns "" when $field is undefined or holds no word character. A field
# with a single component that has neither a later consonant nor a second
# character also yields "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep the first two components and reduce each to its first character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    # first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... or simply the first two characters if there is no such consonant
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # neither pattern matched (e.g. a one character field)
    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}

# Resolves a directory name collision for $index.
#
# $namehash holds, for each of 'index', 'subcollection' and 'languages',
# the original name that a processed name has already been used for.
# $indexref, $subref and $langref are references to the processed names of
# $index; exactly one of them is advanced to its next version (see
# get_next_version). Returns the new concatenated directory name.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # undefined entries compare like empty strings
    my $used_index = $namehash->{'index'}->{$$indexref};
    my $used_sub   = $namehash->{'subcollection'}->{$$subref};
    my $used_lang  = $namehash->{'languages'}->{$$langref};
    $used_index    = "" unless defined $used_index;
    $used_sub      = "" unless defined $used_sub;
    $used_lang     = "" unless defined $used_lang;
    $gran          = "" unless defined $gran;
    $subcollection = "" unless defined $subcollection;
    $languages     = "" unless defined $languages;

    if ($used_index ne "$level:$gran") {
        $self->get_next_version ($indexref);
    } elsif ($used_sub ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($used_lang ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # nothing differs (e.g. "document:text" and "document:text:" were
        # both listed). The name must still change, otherwise a caller that
        # loops until the name is unused would never finish.
        $self->get_next_version ($indexref);
    }

    return "$$indexref$$subref$$langref";
}

# Advances the name referenced by $nameref to its next version in place:
# a name without a trailing digit has its last character replaced by "0",
# otherwise the trailing one or two digit number is incremented.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds a single index.
#
# $index is an index description ("level:fields[:subcollection[:languages]]")
# that must have an entry in $self->{'index_mapping'}. The plugins are run
# through mg_passes twice (-N1 and -N2); between and after the passes the
# other mg tools are run with system(). If the index dictionary (.id file)
# was not produced the index is recorded in $self->{'notbuilt'} and the sub
# returns early. In debug mode no external programs are run.
#
# Dies if a required executable is missing or a pipe cannot be opened.
#
# NOTE(review): the GLI progress lines print only newlines as written here;
# confirm against the upstream source whether markup is missing.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # -M value for mg_passes; collect.cfg may override the default of 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        # the existence check uses the full path, the command itself is
        # found through the shell's search path
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't crap out.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # $handle still holds the name of the PIPEOUT handle, so this
        # reopens the handle the document processor was given above
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "\n" if $self->{'gli'};
}

# Creates the info database and processes the associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped into txt2db (or written to STDOUT in debug mode). The output
# consists of the [collection] entry built from the collectionmeta
# settings of collect.cfg, whatever the plugins and classifiers write, and
# a final [browselist] entry listing all documents.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the mapping is needed to translate ".index" style keys below
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $defaultfound=0;
            my $first=1;
            my $metadata_entry = "";
            my $default="";
            my $cmetamap = "";
            # keys starting with "." are looked up (without the dot) in
            # the index mapping
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                } else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            } else {
                $cmetamap = $cmeta; # just using the same name
            }

            #iterate through the languages
            foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
                if ($first) {
                    $first=0;
                    #set the default default to the first entry
                    $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                }

                if ($lang =~ /default/) {
                    $defaultfound=1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                } else {
                    my ($l) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($l) {
                        $metadata_entry .= "<$cmetamap:$l>" .
                            $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";

                        # Use the English value as the default if no default is specified
                        if ($l =~ /en/i) {
                            $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                        }
                    }
                }
            }

            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "0\n";
    print $handle "VList\n";
    print $handle "" . ($#doclist+1) . "\n";
    print $handle "Invisible\n";
    print $handle "$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "\n" if $self->{'gli'};
}

# Does nothing in this class.
sub collect_specific {
    my $self = shift (@_);
}

# Writes build.cfg into the build directory.
#
# The file records the build date, the document and byte counts reported
# by the document processor, the word and section counts parsed from the
# output of mgstat (when it can be run), the index, subcollection and
# language maps, the indexes that were not built and the maxnumeric value.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" ||
        !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        # read the mgstat output line by line
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    #$build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};
    my @notbuilt = ();
    foreach my $nb (keys %{$self->{'notbuilt'}}) {
        push (@notbuilt, $nb);
    }
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "\n" if $self->{'gli'};
}

# Does nothing in this class.
sub deinit {
    my $self = shift (@_);
}

# Prints the byte counts reported by the document processor for the pass
# that has just finished, followed by a warning when fewer than 50 bytes
# were processed (unless text was deliberately left out of a text pass).
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    my $index = $self->{'buildproc'}->get_index();
    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    } else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        } elsif (!$self->{'no_text'}) {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
        print STDERR "\n" if $self->{'gli'};
    }
}

1;