# MGBuilder object
#
# Drives the MG command line tools (mg_passes, mg_compression_dict,
# mg_perf_hash_build, mg_weights_build, mg_invf_dict, mg_stem_idx and
# txt2db) to build the compressed text, the indexes, the info database
# and the build.cfg file for one collection.
#
# The documents themselves are produced by the collection's plugins and
# written by the "buildproc" document processor to the filehandle
# mgbuilder::PIPEOUT, which this package opens as a pipe to the
# relevant tool.

package mgbuilder;

use strict;

use cfgread;
use colcfg;
use plugin;
use util;

# value passed to mg_passes with -b (the maximum document size)
our $maxdocsize = 12000;

# suffixes of the files kept in an index directory once the index has
# been built; files with any other suffix are deleted by build_index
our %wanted_index_files = ('td'=>1,
                           't'=>1,
                           'idb'=>1,
                           'ib1'=>1,
                           'ib2'=>1,
                           'ib3'=>1,
                           'i'=>1,
                           'ip'=>1,
                           'tiw'=>1,
                           'wa'=>1);


# Creates an mgbuilder object.
#   $collection - name of the collection (a directory under
#                 $GSDLHOME/collect)
#   $source_dir - directory handed to plugin::read as the source of the
#                 documents
#   $build_dir  - directory the build is written to
#   $verbosity  - progress messages go to STDERR when this is >= 1
# Dies if collect.cfg cannot be found, if no plugins were loaded or if
# the document processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity}, $class;

    # read in the collection configuration file
    my $configfilename = "$ENV{'GSDLHOME'}/collect/$collection/collect.cfg";
    if (!-e $configfilename) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($configfilename);

    # sort out subcollection indexes: every index is built once for
    # every entry of indexsubcollections, giving "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@{ $indexes || [] }) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # get the list of plugins for this collection
    my @plugins = (); # some good choice of plugins .... ????
    if (defined $self->{'collect_cfg'}->{'plugins'}) {
        @plugins = @{$self->{'collect_cfg'}->{'plugins'}};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($collection, \@plugins);
    if (scalar(@{ $self->{'pluginfo'} || [] }) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLHOME'}/collect/$collection/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLHOME'}/collect/$collection/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # a class method call on the class name held in $buildproctype;
    # any error raised by the constructor propagates to our caller
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}

# Removes any previous build and recreates the (empty) build directory
# together with its text subdirectory.
sub init {
    my $self = shift (@_);

    # remove any old builds
    &util::rm_r($self->{'build_dir'});
    &util::mk_all_dir($self->{'build_dir'});

    # make the text directory
    my $textdir = "$self->{'build_dir'}/text";
    &util::mk_all_dir($textdir);
}

# Runs an external program and waits for it to finish.
#   $caller  - name of the calling method (used in the messages)
#   $program - full path of the executable
#   $args    - the rest of the command line
# Dies if the executable does not exist.  A non-zero status from
# system() is reported on STDERR but is not fatal.  Returns that status.
sub _run_program {
    my ($caller, $program, $args) = @_;

    if (!-e $program) {
        die "mgbuilder::$caller - couldn't run $program\n";
    }

    my $status = system ("$program $args");
    if ($status != 0) {
        print STDERR "mgbuilder::$caller - warning: $program finished with status $status\n";
    }

    return $status;
}

# Opens a pipe to an external program and feeds it one complete pass
# over the collection: the document processor is reset and plugin::read
# is run over the source directory while the pipe is open.
#   $caller  - name of the calling method (used in the messages)
#   $program - full path of the executable
#   $args    - the rest of the command line
# The caller must already have pointed the document processor at
# 'mgbuilder::PIPEOUT' and chosen its mode.
# Dies if the executable does not exist or the pipe cannot be opened.
# A failing close() is reported on STDERR but is not fatal.  Returns
# the result of close().
sub _pipe_documents {
    my $self = shift (@_);
    my ($caller, $program, $args) = @_;

    # PIPEOUT has to stay a package filehandle: the document processor
    # is given it by name ('mgbuilder::PIPEOUT')
    if (!-e $program || !open (PIPEOUT, "| $program $args")) {
        die "mgbuilder::$caller - couldn't run $program\n";
    }

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});

    # close() on a pipe waits for the program and is false if it failed
    my $closed = close (PIPEOUT);
    if (!$closed) {
        print STDERR "mgbuilder::$caller - warning: $program did not finish cleanly (status $?)\n";
    }

    return $closed;
}

# Builds the compressed text of the collection in <build_dir>/text.
# The documents are passed through mg_passes twice (-T1, then -T2) with
# mg_compression_dict run in between.
sub compress_text {
    my $self = shift (@_);
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes = "$exedir/mg_passes$exe";

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ('section:text');

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    $self->_pipe_documents ('compress_text', $mg_passes,
                            "-f $fulltextprefix -b $maxdocsize -T1 $osextra");

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
    &_run_program ('compress_text', "$exedir/mg_compression_dict$exe",
                   "-f $fulltextprefix -S -H -2 -k 5120 $osextra");

    # compress the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    $self->_pipe_documents ('compress_text', $mg_passes,
                            "-f $fulltextprefix -b $maxdocsize -T2 $osextra");
}

# Builds every index listed in the collection configuration, after
# working out (and storing in $self->{'index_mapping'}) the directory
# each of them is built in.
sub build_indexes {
    my $self = shift (@_);
    my $indexes = $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@{ $indexes || [] }) {
        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}

# Creates directory names for each of the index descriptions.
#   $indexes - reference to an array of index descriptions of the form
#              "level:field,field,...[:subcollection,...]"
# Returns a reference to a hash mapping each description to a lowercase
# directory name that is unique among the descriptions and different
# from the mandatory directories 'text' and 'extra'.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    foreach my $index (@{ $indexes || [] }) {
        my ($level, $fields, $subcollection) = split (":", $index);

        my @fields = defined $fields ? split (/,/, $fields) : ();
        splice (@fields, 2) if (scalar (@fields) > 2); # just want first two fields

        # the directory names starts with the first character of the index level
        my ($dirname) = $level =~ /^(.)/;

        # next comes a processed version of the first two fields in the index
        # the processed version contains the first character and the next
        # consonant (a field with no consonant after its first character
        # is left as it is, because the substitution does not match)
        foreach my $field (@fields) {
            $field =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]).*$/$1$2/i;
        }
        $dirname .= join("", @fields);

        # next comes a processed version of the subcollection if there is one.
        if (defined ($subcollection) && $subcollection =~ /\w/) {
            my @subparts = split /,/, $subcollection;
            if (scalar @subparts >= 2) {
                # two or more subcollections: the first character of
                # each of the first two
                splice (@subparts, 2);
                foreach my $subpart (@subparts) {
                    $subpart =~ s/^(.).*$/$1/i;
                }
                $dirname .= join("", @subparts);
            } else {
                # a single subcollection: the first character, plus the
                # second character when that is a consonant.
                # NOTE(review): the consonant group is optional here, so
                # the lazy .*? never skips ahead to a later consonant as
                # it does for the fields above - confirm this is intended
                $subcollection =~ s/^(.).*?([bcdfghjklmnpqrstvwxyz]?).*$/$1$2/i;
                $dirname .= $subcollection;
            }
        }

        # convert the directory name to lowercase
        $dirname = lc ($dirname);

        # add a number to make this directory name unique
        if (defined $dirnames{$dirname}) {
            my $num = 1;
            while (defined $dirnames{"$dirname$num"}) {
                $num++;
            }
            $dirname .= $num;
        }

        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
    }

    return \%mapping;
}

# Builds one index.
#   $index - an index description; its directory is looked up in
#            $self->{'index_mapping'}, so create_index_mapping (normally
#            via build_indexes) must have been run first
# Runs the two index passes of mg_passes and the tools that complete the
# index, then deletes the files in the index directory whose suffix is
# not listed in %wanted_index_files.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    my $fullindexdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    &util::mk_all_dir ($fullindexdir);
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    # NOTE(review): only the index prefix gets backslashes on windows,
    # the text prefix given to mg_weights_build is left alone - confirm
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes = "$exedir/mg_passes$exe";
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection (an index without a subcollection part
    # simply gets an empty list of expressions)
    my $indexexparr = [];
    my (undef, undef, $subcollection) = split (":", $index);
    my @subcollections = defined $subcollection ? split (/,/, $subcollection) : ();
    foreach my $subcollname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcollname};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    $self->_pipe_documents ('build_index', $mg_passes,
                            "-f $fullindexprefix -b $maxdocsize " .
                            "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra");

    # create the perfect hash function
    &_run_program ('build_index', "$exedir/mg_perf_hash_build$exe",
                   "-f $fullindexprefix $osextra");

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    $self->_pipe_documents ('build_index', $mg_passes,
                            "-f $fullindexprefix -b $maxdocsize " .
                            "-$index_level -c 3 -G -t 10 -N2 $osextra");

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    &_run_program ('build_index', "$exedir/mg_weights_build$exe",
                   "-f $fullindexprefix -t $fulltextprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    &_run_program ('build_index', "$exedir/mg_invf_dict$exe",
                   "-f $fullindexprefix $osextra");

    # creates stem index files for the various stemming methods
    # (the executable only needs to be checked once; it is then run
    # for each of -s1, -s2 and -s3)
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    foreach my $stemmethod (1, 2, 3) {
        &_run_program ('build_index', "$exedir/mg_stem_idx$exe",
                       "-b 4096 -s$stemmethod -f $fullindexprefix $osextra");
    }

    # remove unwanted files: anything with a suffix that is not in
    # %wanted_index_files (dot files and files without a suffix stay)
    opendir (my $dirhandle, $fullindexdir)
        or die "mgbuilder::build_index - couldn't read directory $fullindexdir\n";
    foreach my $file (readdir ($dirhandle)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
            # print STDERR "deleting $file\n";
            &util::rm (&util::filename_cat ($fullindexdir, $file));
        }
    }
    closedir ($dirhandle);
}

# Builds the info database <build_dir>/text/<collection>.ldb by running
# the document processor in 'infodb' mode with its output piped into
# txt2db.
sub make_infodatabase {
    my $self = shift (@_);
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);

    # assume little-endian for now :-)
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}.ldb");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();

    print STDERR "\n*** creating the info database\n" if ($self->{'verbosity'} >= 1);

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('infodb');

    # pipe the database entries written by the document processor
    # into txt2db
    $self->_pipe_documents ('make_infodatabase', "$exedir/txt2db$exe", $fulldbname);
}

# Writes <build_dir>/build.cfg, which records the build date, the number
# of documents and bytes reported by the document processor, and the
# mapping between the index names and their directory names.
sub make_auxiliary_files {
    my $self = shift (@_);

    # everything that ends up in build.cfg; a lexical, so nothing is
    # left over from one call to the next
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    # (each entry has the form "index->directory")
    my @indexmap = ();
    foreach my $index (@{ $self->{'collect_cfg'}->{'indexes'} || [] }) {
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$', '^(indexmap)$');
}

# Nothing to tidy up at present.
sub deinit {
    my $self = shift (@_);
}


1;