########################################################################### # # mgbuilder.pm -- MGBuilder object # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package mgppbuilder; use classify; use cfgread; use colcfg; use plugin; use util; #update this !!!!!!!!!!!!!!!! 
# suffixes of index files that are considered worth keeping. Within this
# file the hash is only consulted by the disabled clean-up code in
# build_index.
%wanted_index_files = ('td'=>1,
                       't'=>1,
                       'idb'=>1,
                       'ib1'=>1,
                       'ib2'=>1,
                       'ib3'=>1,
                       'i'=>1,
                       'ip'=>1,
                       'tiw'=>1,
                       'wa'=>1);


# creates a new mgppbuilder object.
#
# arguments: the collection name, the source (import) directory, the
# build directory, the verbosity level, the maximum number of documents,
# the debug flag, the keepold flag and the allclassifications flag. They
# are stored in the object under keys of the same name.
#
# reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list with the
# configured subcollections and languages, loads the plugins and the
# classifiers and creates the document processor ('buildproc').
#
# dies if collect.cfg is missing, if no plugins could be loaded or if the
# document processor cannot be loaded/created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is combined with
    # every subcollection ("index:subcollection")
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every (possibly already expanded)
    # index is combined with every language ("index:language")
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time; calling the constructor
    # through the class name string avoids building perl source with a
    # string eval. Any error raised by the constructor still propagates.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}

# prepares the build directory. Unless the debug or keepold flag is set
# the old build directory is removed and recreated together with its
# "text" subdirectory.
sub init {
    my $self = shift (@_);

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }
}

# builds the compressed text and a single index in one go, writing all
# files directly into the build directory.
#
# arguments: $textindex (only echoed to STDERR here) and $indexname, the
# index specification handed to the document processor. When $indexname
# contains no word characters a fixed default specification is used.
#
# the documents are read twice through mg_passes (-T1 -I1, then
# -T2 -I2); the helper programs mg_compression_dict, mg_perf_hash_build,
# mg_weights_build, mg_invf_dict and mg_stem_idx are run along the way.
# In debug mode the document processor output goes to STDOUT instead of
# mg_passes. Dies if a required executable is missing or cannot be
# started; the exit status of the system() calls is not checked.
sub build_collection {
    my $self = shift (@_);
    my ($textindex, $indexname) = @_;

    print STDERR "build_col, textindex=$textindex, indexname=$indexname\n";

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "$self->{'collection'}";

    # both prefixes are the build directory itself (passed with -d)
    # NOTE(review): only the text prefix gets its separators converted
    # on windows - confirm whether the index prefix should follow suit
    my $fulltextprefix = $self->{'build_dir'};
    my $fullindexprefix = $self->{'build_dir'};
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    }

    # indexname got from command line arg. if not specified, its "", so
    # use ones stated in cfg file
    # NOTE(review): the list of cfg indexes is only reported below; the
    # document processor is given the fixed default specification
    my $indexes = [];
    if (!(defined $indexname && $indexname =~ /\w/)) {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
        $indexname = "Title,Organization,Magazine,text";
    } else {
        push @$indexes, $indexname;
    }
    print STDERR "indexes are: @$indexes\n";

    print STDERR "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
    print STDERR "fulltextprefix=$fulltextprefix\n";

    # carry out the first pass of mg_passes
    # -b $maxdocsize sets the maximum document size to be 12 meg - not
    # available any longer
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # the output handle is passed around by name: re-opening PIPEOUT for
    # the second pass below relies on the document processor still
    # writing to the handle of that name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -T1 -I1 -d $fulltextprefix -f $basefilename")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # Assume that only going to build one index for now. so index will
    # be anything specified in cfg file
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($indexname);
    $self->{'buildproc'}->set_indexing_text (1);    # not used at the moment I think
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # $handle names PIPEOUT, so a single close is enough
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the compression dictionary
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $fulltextprefix -f $basefilename");

        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $fullindexprefix -f $basefilename");

        # start the second pass, which compresses the text
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -d $fulltextprefix -f $basefilename -T2 -I2")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    # the remaining steps are run in debug mode as well

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_weights_build_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_weights_build_exe\n";
    }
    system ("$mg_weights_build_exe -d $fullindexprefix -f $basefilename ");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_invf_dict_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_invf_dict_exe\n";
    }
    system ("$mg_invf_dict_exe -d $fullindexprefix -f $basefilename");

    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_stem_idx_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_stem_idx_exe\n";
    }
    system ("$mg_stem_idx_exe -b 4096 -s1 -d $fullindexprefix -f $basefilename");
    system ("$mg_stem_idx_exe -b 4096 -s2 -d $fullindexprefix -f $basefilename");
    system ("$mg_stem_idx_exe -b 4096 -s3 -d $fullindexprefix -f $basefilename");
}

# for mgpp with more than one index
#
# creates the compressed text below <build_dir>/text. $textindex is the
# index specification handed to the document processor.
#
# the documents are read twice through mg_passes (-T1 to collect the
# statistics, -T2 to compress) with mg_compression_dict run in between.
# In debug mode the document processor output goes to STDOUT and no
# external program is started. Dies if a required executable is missing
# or cannot be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $basefilename =~ s/\//\\/g;
        $builddir =~ s/\//\\/g;
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # the output handle is passed around by name: re-opening PIPEOUT for
    # the second pass below relies on the document processor still
    # writing to the handle of that name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # $handle names PIPEOUT, so a single close is enough
    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from
    # a seed dictionary (-S), if a novel word is encountered it is
    # spelled out (-H), and the resulting dictionary must be less than
    # 5 meg with the most frequent words being put into the dictionary
    # first (-2 -k 5120)
    # note: these options are left over from the mg version
    if (!$self->{'debug'}) {
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # start the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};
}

# returns 1 if $index should be built and 0 if it matches one of the
# "dontbuild" entries of the collection configuration. Each entry is
# used as a regular expression that has to match the whole index
# description. The directory name of a rejected index is recorded in
# the 'notbuilt' list.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            if ($index =~ /^$checkstr$/) {
                push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
                return 0;
            }
        }
    }

    return 1;
}

# builds the index named by $indexname or, when $indexname contains no
# word characters, every index listed in the collection configuration.
# Indexes rejected by want_built are skipped. The index mapping is
# (re)created and stored in the object as a side effect.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print STDERR "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            $self->build_index($index);
        } else {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}

# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of "fields:subcollection:languages"
# descriptions (the last two parts are optional). Returns a reference to
# a hash that contains
#   <index description>     => directory name
#   'indexmap'              => { fields        => processed name }
#   'subcollectionmap'      => { subcollection => processed name }
#   'languagemap'           => { languages     => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                           => the keys of the maps above in the order
#                              they were first seen
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original name, one table per component. Each
    # table has to be a hash of its own so that the three kinds of name
    # cannot overwrite each other.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my ($pindex) = $self->process_field($fields);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # the order list has to hold the same key as the map itself
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# an undefined field or a field without word characters gives "". A
# single component without a later consonant gives its first two
# characters; a field that is too short even for that is returned
# unchanged.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep the first two components and cut each down to its
        # first character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # a one character field cannot be shortened; returning it as it is
    # avoids an empty directory name
    return $field unless defined $first && defined $second;

    return "$first$second";
}

# alters one of the processed name components so that the combined
# directory name differs from the ones already taken.
#
# $namehash holds, for 'index', 'subcollection' and 'languages', the
# original name each processed name was assigned to. $index is the
# index description being placed; $indexref, $subref and $langref are
# references to its processed components, one of which is modified in
# place. Returns the new combined directory name.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;

    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # every component is already registered for exactly these names
        # (e.g. the same index was listed twice). Something still has to
        # change or the caller would loop forever.
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}

# changes the name referenced by $nameref into its next version: a
# trailing number is incremented, otherwise the last character is
# replaced by "0".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        # only one digit is present here, so that is all there is to
        # replace when rolling over from 9 to 10
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
        # an empty name has no character to replace
        $$nameref =~ s/.$/0/ or $$nameref .= "0";
    }
}

# builds one index in its own subdirectory of the build directory.
#
# $index is an index description ("fields:subcollection..."); its
# directory name is looked up in $self->{'index_mapping'}.
#
# the documents are read twice through mg_passes (-I1, then -I2);
# mg_perf_hash_build, mg_weights_build, mg_invf_dict and mg_stem_idx
# are run along the way. In debug mode the document processor output
# goes to STDOUT and no external program is started. Dies if a required
# executable is missing or cannot be started; the exit status of the
# system() calls is not checked.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};
    my $basefilename = &util::filename_cat ($indexdir, $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir =~ s/\//\\/g;
        $basefilename =~ s/\//\\/g;
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for the configured languages
    # NOTE(review): every configured language is added, whatever the
    # language part of $index is - confirm this is intended
    foreach my $language (@{$self->{'collect_cfg'}->{'languages'} || []}) {
        # work on a copy: the loop variable is an alias of the
        # configuration entry, and stripping the "!" from the entry
        # itself would lose the negation for every later index
        my $langname = $language;
        if ($langname =~ s/^\!//) {
            push (@$indexexparr, "!Language/$langname/");
        } else {
            push (@$indexexparr, "Language/$langname/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # the output handle is passed around by name: re-opening PIPEOUT for
    # the second pass below relies on the document processor still
    # writing to the handle of that name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        # start the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {
        close ($handle);

        # create the weights file
        print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");

        # remove unwanted files
#       my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
#       opendir (DIR, $tmpdir) || die
#           "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
#       foreach $file (readdir(DIR)) {
#           next if $file =~ /^\./;
#           my ($suffix) = $file =~ /\.([^\.]+)$/;
#           if (defined $suffix && !defined $wanted_index_files{$suffix}) {
#               # delete it!
#               print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
#               &util::rm (&util::filename_cat ($tmpdir, $file));
#           }
#       }
#       closedir (DIR);
    }
}

# creates the info database <build_dir>/text/<collection>.ldb (or .bdb
# when util::is_little_endian is false) by piping the output of the
# document processor, running in 'infodb' mode, into txt2db. The
# associated files directory <build_dir>/assoc is created as well.
#
# the collection metadata of the configuration file is written first as
# a "[collection]" entry, the classification information last. In debug
# mode everything is written to STDOUT instead. Dies if txt2db is
# missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print STDERR "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; the output handle is passed around
    # by name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the mapping is needed to translate ".<index>" metadata names
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            # names starting with "." refer to an index and are written
            # under the directory name of that index
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
                    print STDERR "have .section entry in collect file\n";
                } else {
                    print STDERR "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                }
            } else {
                print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}

# hook for collection specific processing; does nothing here.
sub collect_specific {
    my $self = shift (@_);
}

# writes <build_dir>/build.cfg: the build date, the number of documents
# and bytes reported by the document processor, the index,
# subcollection and language maps ("name->directory name") and the list
# of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh hash for every call, so that nothing written by an
    # earlier call can end up in this build.cfg
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');
}

# hook called when building has finished; does nothing here.
sub deinit {
    my $self = shift (@_);
}

1;