###########################################################################
#
# mgbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package mgbuilder;
use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;
# Compile-time setup: turn autoflush on for both standard streams so that
# output from mg and output from the plugins do not get out of sync.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# At program exit, switch autoflush off again on both standard streams.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Value given to mg_passes through its -b option (maximum document size).
my $maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index removes files with any other suffix from the index directory.
my %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
# Constructor.  Builds an mgbuilder object for one collection: reads the
# collection's collect.cfg, expands the configured indexes with any
# subcollection and language partitions, loads the plugins and
# classifiers, and creates the document processor ("buildproc").
#
# Positional arguments:
#   $collection   - collection name
#   $source_dir   - directory handed to the plugins when reading documents
#   $build_dir    - directory the build is written to
#   $verbosity    - verbosity level
#   $maxdocs      - passed through to the plugins
#   $debug        - stored; the build methods write to STDOUT when set
#   $keepold      - stored; init() keeps an existing build when set
#   $remove_empty_classifications
#                 - stored; passed to classify::output_classify_info
#   $outhandle    - handle for messages (defaults to STDERR)
#   $no_text      - stored; defaults to 0
#   $failhandle   - passed to plugin::load_plugins (defaults to STDERR)
#   $gli          - stored; defaults to 0
#
# Returns the new object.  Dies if collect.cfg does not exist, if no
# plugins were loaded, or if the buildproc cannot be constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # The bareword defaults are kept on purpose: build_index compares the
    # output handle against the string "STDERR".
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;
    $gli = 0 unless defined $gli;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},   # indexes not built
                      'gli'=>$gli
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    $self->{'collect_cfg'} = {} unless defined $self->{'collect_cfg'};
    my $cfg = $self->{'collect_cfg'};

    $cfg->{'indexes'} = [] unless defined $cfg->{'indexes'};

    # sort out subcollection indexes: every index is repeated once per
    # subcollection, as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is repeated once per
    # language; when there is no subcollection part an empty subcollection
    # field is added so the language is always the fourth field
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, $index . $separator . $language);
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence is kept, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class name held in a scalar can be used directly as the invocant,
    # so no string eval or indirect-object syntax is needed.  The block
    # eval keeps the original behaviour of re-throwing the error as a
    # plain string.
    $self->{'buildproc'} = eval {
        $buildproctype->new ($collection, $source_dir, $build_dir,
                             $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}
# Prepares the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed and created
# afresh together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
# Compresses the collection text using the mg tools.
#
# Parameter:
#   $textindex - index description handed to the buildproc via set_index()
#
# The documents are run through the plugins twice: first into
# "mg_passes -T1" to collect text statistics, then (once
# mg_compression_dict has built the compression dictionary) into
# "mg_passes -T2".  In debug mode no external program is started and the
# buildproc writes to STDOUT instead.
#
# Dies if a required executable does not exist or if a pipe to mg_passes
# cannot be opened.
sub compress_text {
my $self = shift (@_);
my ($textindex) = @_;
# locate the mg executables for this operating system
my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
my $exe = &util::get_os_exe ();
my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
my $outhandle = $self->{'outhandle'};
# value for the -M option of mg_passes: 4 unless collect.cfg supplies an
# all-digit "maxnumeric" setting
my $maxnumeric = 4;
if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
$self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
$maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
}
&util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
my $basefilename = "text/$self->{'collection'}";
my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);
# on windows the prefix is rewritten with backslashes; elsewhere the mg
# tools are given "-d /" as an extra argument
my $osextra = "";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$fulltextprefix =~ s@/@\\@g;
} else {
$osextra = " -d /";
}
print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
# collect the statistics for the text
# -b $maxdocsize sets the maximum document size to be 12 meg
print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
my ($handle);
if ($self->{'debug'}) {
$handle = STDOUT;
} else {
# NOTE(review): the existence test uses the full path in $mg_passes_exe,
# but the command that is run is the bare name "mg_passes$exe" --
# presumably this relies on the executable directory being on the PATH;
# confirm.
if (!-e "$mg_passes_exe" ||
!open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
}
# $handle refers to the PIPEOUT handle by name; the second pass below
# re-opens the pipe through $handle
$handle = mgbuilder::PIPEOUT;
}
# set up the document processor for the first (statistics) pass
$self->{'buildproc'}->set_output_handle ($handle);
$self->{'buildproc'}->set_mode ('text');
$self->{'buildproc'}->set_index ($textindex);
$self->{'buildproc'}->set_indexing_text (0);
# the text is only stored when the builder was not created with no_text
if ($self->{'no_text'}) {
$self->{'buildproc'}->set_store_text(0);
} else {
$self->{'buildproc'}->set_store_text(1);
}
$self->{'buildproc'}->reset();
# first pass over the documents; plugin::begin and plugin::end are only
# called around this pass
&plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
$self->{'buildproc'}, $self->{'maxdocs'});
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
&plugin::end($self->{'pluginfo'});
close ($handle) unless $self->{'debug'};
$self->print_stats();
# create the compression dictionary
# the compression dictionary is built by assuming the stats are from a seed
# dictionary (-S), if a novel word is encountered it is spelled out (-H),
# and the resulting dictionary must be less than 5 meg with the most frequent
# words being put into the dictionary first (-2 -k 5120)
if (!$self->{'debug'}) {
print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
if (!-e "$mg_compression_dict_exe") {
die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
}
# NOTE(review): the exit status of mg_compression_dict is not checked
system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
# -b $maxdocsize sets the maximum document size to be 12 meg
if (!-e "$mg_passes_exe" ||
!open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
}
}
else {
print STDERR "\n" if $self->{'gli'};
}
$self->{'buildproc'}->reset();
# compress the text (second pass over the documents)
print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
close ($handle) unless $self->{'debug'};
$self->print_stats();
print STDERR "\n" if $self->{'gli'};
}
# Decides whether an index should be built.
#
# Parameter:
#   $index - index description
#
# Each entry of the "dontbuild" list in collect.cfg is used as a pattern
# that must match the whole index description.  On the first match the
# index is recorded in $self->{'notbuilt'} and 0 is returned; otherwise
# (or when there is no dontbuild list) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
# Builds either one named index or every index listed in collect.cfg.
#
# Parameter:
#   $indexname - optional; when it contains a word character only this
#                index is built, otherwise all configured indexes are
#
# The mapping from index descriptions to directory names is (re)created
# first and stored in $self->{'index_mapping'}.  Indexes rejected by
# want_built() are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        my $subdir = $self->{'index_mapping'}->{$index};
        print $outhandle "\n*** building index $index in subdirectory $subdir\n"
            if $verbose;
        print STDERR "\n" if $self->{'gli'};
        $self->build_index($index);
    }
}
# Creates a directory name for each of the index descriptions.
#
# Parameter:
#   $indexes - reference to an array of index descriptions of the form
#              "level:fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   - an entry for every full index description (value: directory name),
#   - 'indexmap', 'subcollectionmap' and 'languagemap' sub-hashes with the
#     processed name of each "level:fields", subcollection and language
#     part, plus the same entries directly at the top level,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder'
#     arrays recording the order in which those parts were first seen.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # directory names already taken, used to check for collisions; this
    # starts off with the mandatory directory names
    my %dirnames = ('text' => 'text', 'extra' => 'extra');

    # processed names handed out so far, and what each one stands for
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);
        my $levelgran = "$level:$gran";

        # the directory name starts with the first character of the index
        # level, followed by a processed version of the index fields
        my ($levelchar) = $level =~ /^(.)/;
        my $pindex = lc ($levelchar . $self->process_field ($gran));

        # then processed versions of the subcollection and the language,
        # if there are any
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep adjusting the parts until the directory name is unique
        my $dirname = $pindex . $psub . $plang;
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps, and also put the
        # index, subcollection and language parts into the mapping itself
        # (the full index name, eg document:text:subcol:lang, is not used
        # on the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{$levelgran}) {
            $mapping{'indexmap'}{$levelgran} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $levelgran);
            $mapping{$levelgran} = $pindex unless defined $mapping{$levelgran};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are now in use
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $levelgran;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
# Returns a processed (shortened) version of a field.
#
# Parameter:
#   $field - a comma separated list of components; may be undefined
#
# If the field has two or more components the result is the first
# character of each of the first two components.  If it has only one
# component the result is the first character followed by the next
# consonant of that component, or simply its first two characters when
# no later consonant exists.  An undefined field, a field without any word
# character, or a single-character field gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep only the first two components, cut down to one character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    # Named lexicals are used here rather than $a/$b, which would mask
    # the package variables that sort() relies on.
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # both are still undefined for a single-character field
    return join("", grep { defined } ($first, $second));
}
# Alters one part of a clashing directory name and returns the new name.
#
# Parameters:
#   $namehash - reference to a hash with 'index', 'subcollection' and
#               'languages' sub-hashes mapping processed names to the
#               values they were created for
#   $index    - the full index description being named
#   $indexref, $subref, $langref
#             - references to the processed index, subcollection and
#               language parts; the chosen part is changed in place
#
# The first part (index, then subcollection, then language) whose
# processed name is registered for a different value is passed to
# get_next_version().  Returns the three parts joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $clashing = $indexref;
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }
    $self->get_next_version ($clashing) if defined $clashing;

    return join ("", $$indexref, $$subref, $$langref);
}
# Moves a processed name on to its next version, changing it in place.
#
# Parameter:
#   $nameref - reference to the name
#
# A name ending in two digits has that two-digit number incremented, a
# name ending in a single digit has that digit incremented (9 becomes 10),
# and any other name has its last character replaced by 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $counter = $1;
        $counter++;
        $$nameref =~ s/\d\d$/$counter/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $next = $1 + 1;      # 9 rolls over to 10
        $$nameref =~ s/\d$/$next/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
# Builds a single index with the mg tools.
#
# Parameter:
#   $index - index description, "level:fields[:subcollection[:language]]"
#
# Outside debug mode the steps are: "mg_passes -N1" (index dictionary),
# mg_perf_hash_build, "mg_passes -N2" (inverting the text),
# mg_weights_build, mg_invf_dict and three runs of mg_stem_idx; finally
# files whose suffix is not in %wanted_index_files are deleted from the
# index directory.  In debug mode the buildproc writes to STDOUT and no
# external program is run.
#
# If the index dictionary (.id) file does not exist after the first pass
# the index is recorded in $self->{'notbuilt'} and the sub returns early.
# Dies if a required executable is missing, a pipe to mg_passes cannot be
# opened, or the index directory cannot be read.
sub build_index {
my $self = shift (@_);
my ($index) = @_;
my $outhandle = $self->{'outhandle'};
# get the full index directory path and make sure it exists
my $indexdir = $self->{'index_mapping'}->{$index};
&util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
$self->{'collection'});
my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
$self->{'collection'});
# get any os specific stuff
my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
my $exe = &util::get_os_exe ();
my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
my $mg_perf_hash_build_exe =
&util::filename_cat($exedir, "mg_perf_hash_build$exe");
my $mg_weights_build_exe =
&util::filename_cat ($exedir, "mg_weights_build$exe");
my $mg_invf_dict_exe =
&util::filename_cat ($exedir, "mg_invf_dict$exe");
my $mg_stem_idx_exe =
&util::filename_cat ($exedir, "mg_stem_idx$exe");
# value for the -M option of mg_passes: 4 unless collect.cfg supplies an
# all-digit "maxnumeric" setting
my $maxnumeric = 4;
if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
$self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
$maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
}
# NOTE(review): on windows only $fullindexprefix is rewritten with
# backslashes; $fulltextprefix (used for mg_weights_build below) is left
# as it is -- confirm this is intended.
my $osextra = "";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$fullindexprefix =~ s@/@\\@g;
} else {
$osextra = " -d /";
# this is a string comparison against the handle's name
if ($outhandle ne "STDERR") {
# so mg_passes doesn't print to stderr if we redirect output
$osextra .= " 2>/dev/null";
}
}
# get the index level from the index description
# the index will be level 2 unless we are building a
# paragraph level index
my $index_level = 2;
$index_level = 3 if $index =~ /^paragraph/i;
# get the index expression if this index belongs
# to a subcollection
my $indexexparr = [];
# there may be subcollection info, and language info.
my ($level, $fields, $subcollection, $language) = split (":", $index);
my @subcollections = ();
@subcollections = split /,/, $subcollection if (defined $subcollection);
# only subcollections that are defined in collect.cfg contribute an
# expression
foreach my $subcollection (@subcollections) {
if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
}
}
# add expressions for languages if this index belongs to
# a language subcollection - only put languages expressions for the
# ones we want in the index
# this puts a separate Language/en entry in for each language in the list
# is this what we want?
# should we just have one entry with Language/en,es/ ??
my @languages = ();
@languages = split /,/, $language if (defined $language);
foreach my $language (@languages) {
# a leading "!" is stripped from the language and put in front of the
# whole expression instead
my $not=0;
if ($language =~ s/^\!//) {
$not = 1;
}
if($not) {
push (@$indexexparr, "!Language/$language/");
} else {
push (@$indexexparr, "Language/$language/");
}
}
# Build index dictionary. Uses verbatim stem method
print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
my ($handle);
if ($self->{'debug'}) {
$handle = STDOUT;
} else {
# NOTE(review): the existence tests in this sub use the full executable
# paths, but the commands are run by bare name -- presumably the
# executable directory is expected to be on the PATH; confirm.
if (!-e "$mg_passes_exe" ||
!open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
"-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
}
# $handle refers to the PIPEOUT handle by name; the second pass below
# re-opens the pipe through $handle
$handle = mgbuilder::PIPEOUT;
}
# set up the document processor
$self->{'buildproc'}->set_output_handle ($handle);
$self->{'buildproc'}->set_mode ('text');
$self->{'buildproc'}->set_index ($index, $indexexparr);
$self->{'buildproc'}->set_indexing_text (1);
$self->{'buildproc'}->set_store_text(1);
$self->{'buildproc'}->reset();
# first pass over the documents
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
close ($handle) unless $self->{'debug'};
$self->print_stats();
# now we check to see if the required files have been produced - if not we
# stop building this index so that the whole build process doesn't fail.
# we check on the .id file - index dictionary
# NOTE(review): nothing in this sub creates the .id file in debug mode, so
# unless it already exists a debug run returns here -- confirm intended.
my $dict_file = "$fullindexprefix.id";
if (!-e $dict_file) {
print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
$self->{'notbuilt'}->{$index}=1;
return;
}
if (!$self->{'debug'}) {
# create the perfect hash function
if (!-e "$mg_perf_hash_build_exe") {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
}
# NOTE(review): the exit status of the system() calls in this sub is
# not checked
system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");
if (!-e "$mg_passes_exe" ||
!open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
"-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
}
}
# invert the text (second pass over the documents)
print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
$self->{'buildproc'}->reset();
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
$self->print_stats ();
if (!$self->{'debug'}) {
close ($handle);
# create the weights file
print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
if (!-e "$mg_weights_build_exe") {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
}
system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");
# create 'on-disk' stemmed dictionary
print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
if (!-e "$mg_invf_dict_exe") {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
}
system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");
# creates stem index files for the various stemming methods
print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
if (!-e "$mg_stem_idx_exe") {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
}
system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
# remove unwanted files: anything in the index directory that has a
# suffix which is not listed in %wanted_index_files (names starting with
# a dot and names without a suffix are left alone)
my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
opendir (DIR, $tmpdir) || die
"mgbuilder::build_index - couldn't read directory $tmpdir\n";
foreach my $file (readdir(DIR)) {
next if $file =~ /^\./;
my ($suffix) = $file =~ /\.([^\.]+)$/;
if (defined $suffix && !defined $wanted_index_files{$suffix}) {
# delete it!
print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
&util::rm (&util::filename_cat ($tmpdir, $file));
}
}
closedir (DIR);
}
print STDERR "\n" if $self->{'gli'};
}
# Creates the collection's info database and processes associated files.
#
# The records are written to a pipe into txt2db (or to STDOUT in debug
# mode) in this order: an optional [collection] record built from the
# collectionmeta settings in collect.cfg, whatever the buildproc writes
# while the plugins read the documents in 'infodb' mode, the
# classification information, and a [browselist] record listing the
# documents.
#
# Dies if txt2db does not exist or the pipe to it cannot be opened.
sub make_infodatabase {
my $self = shift (@_);
my $outhandle = $self->{'outhandle'};
my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
&util::mk_all_dir ($textdir);
&util::mk_all_dir ($assocdir);
# get db name: the extension is .ldb when util::is_little_endian() is
# true and .bdb otherwise
my $dbext = ".bdb";
$dbext = ".ldb" if &util::is_little_endian();
my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
$fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
my $exe = &util::get_os_exe ();
my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
print $outhandle "\n*** creating the info database and processing associated files\n"
if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
# init all the classifiers
&classify::init_classifiers ($self->{'classifiers'});
# set up the document processor
my ($handle);
if ($self->{'debug'}) {
$handle = STDOUT;
} else {
# NOTE(review): the existence test uses the full path in $txt2db_exe, but
# the command that is run is the bare name "txt2db$exe" -- presumably
# this relies on the executable directory being on the PATH; confirm.
if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
print STDERR "\n\n" if $self->{'gli'};
die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
}
# $handle refers to the PIPEOUT handle by name
$handle = mgbuilder::PIPEOUT;
}
$self->{'buildproc'}->set_output_handle ($handle);
$self->{'buildproc'}->set_mode ('infodb');
$self->{'buildproc'}->set_assocdir ($assocdir);
$self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
$self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
$self->{'buildproc'}->set_indexing_text (0);
$self->{'buildproc'}->set_store_text(1);
$self->{'buildproc'}->reset();
# write the [collection] record holding the collection level metadata
if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
# the index mapping is needed to translate the ".name" entries below;
# create it here if build_indexes has not already done so
if (!defined $self->{'index_mapping'}) {
$self->{'index_mapping'} =
$self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
}
print $handle "[collection]\n";
foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
my $defaultfound=0;
my $first=1;
my $metadata_entry = "";
my $default="";
my $cmetamap = "";
# a name starting with "." is written under the name the index mapping
# holds for it; the leading dot is put back afterwards because the
# collectionmeta hash is keyed on the original name
if ($cmeta =~ s/^\.//) {
if (defined $self->{'index_mapping'}->{$cmeta}) {
$cmetamap = $self->{'index_mapping'}->{$cmeta};
$cmeta = ".$cmeta";
}
else {
print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
next; #ignore this one
}
}
else {
$cmetamap = $cmeta; # just using the same name
}
#iterate through the languages
foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
if ($first) {
$first=0;
#set the default default to the first entry
$default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
}
if ($lang =~ /default/) {
$defaultfound=1;
#the default entry goes first
$metadata_entry = "<$cmetamap>" .
$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
}
else {
# language specific keys have the form "[l=xx]"; other keys are skipped
my ($l) = $lang =~ /^\[l=(\w*)\]$/;
if ($l) {
$metadata_entry .= "<$cmetamap:$l>" .
$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";
# Use the English value as the default if no default is specified
if ($l =~ /en/i) {
$default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
}
}
}
}
#if we haven't found a default, put one in
if (!$defaultfound) {
$metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
}
#write the entry to the file
print $handle $metadata_entry;
}
# a line of 70 dashes ends the record
print $handle "\n" . ('-' x 70) . "\n";
}
# pass over the documents; the buildproc writes to $handle
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
# output classification information
&classify::output_classify_info ($self->{'classifiers'}, $handle,
$self->{'remove_empty_classifications'},
$self->{'gli'});
#output doclist: a [browselist] record giving the number of documents and
#the document list joined with ";"
my @doclist = $self->{'buildproc'}->get_doc_list();
my $docs = join (";",@doclist);
print $handle "[browselist]\n";
print $handle "0\n";
print $handle "VList\n";
print $handle "" . ($#doclist+1) . "\n";
print $handle "Invisible\n";
print $handle "$docs";
print $handle "\n" . ('-' x 70) . "\n";
close ($handle) if !$self->{'debug'};
print STDERR "\n" if $self->{'gli'};
}
# Takes the object and does no work.
# NOTE(review): presumably a placeholder for collection specific build
# steps -- confirm against the callers.
sub collect_specific {
    my $self = shift;
}
# Writes build.cfg, the file describing the finished build, into the
# build directory.
#
# The file records the build date, the document and byte counts reported
# by the buildproc, the word and section counts parsed from the output of
# mgstat (when it can be run), the index, subcollection and language
# maps, the list of indexes that were not built, and the maxnumeric
# setting.  A failure to run mgstat only produces a warning.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        # read the mgstat output line by line from the pipe opened above
        while (defined (my $line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" .
                  $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # the indexes that were skipped or could not be created
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # 4 unless collect.cfg supplies an all-digit "maxnumeric" setting
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');
    print STDERR "\n" if $self->{'gli'};
}
# Takes the object and does no work.
# NOTE(review): presumably the counterpart of init() for any clean-up
# after a build -- confirm against the callers.
sub deinit {
    my $self = shift;
}
# Prints statistics for the pass the buildproc has just completed to the
# output handle: which index was processed, the total number of bytes in
# the collection and the number of bytes processed for the index.
#
# A warning is added when fewer than 50 bytes were processed, unless the
# pass was a text compression pass and the builder was created with
# no_text set.
sub print_stats {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print {$outhandle} $heading
        . "Total bytes in collection: $num_bytes\n"
        . "Total bytes in $index: $num_processed_bytes\n";

    # was any text expected from this pass at all?
    my $text_expected = ($indexing_text || !$self->{'no_text'});
    if ($num_processed_bytes < 50 && $text_expected) {
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";
        print {$outhandle} "***************\n"
            . $warning
            . " Was this your intention?\n"
            . "***************\n";
        print STDERR "\n" if $self->{'gli'};
    }
}
1;