###########################################################################
#
# mgppbuilder.pm -- MGPPBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package mgppbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mg
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# NOTE(review): not referenced anywhere else in this file - confirm
# whether external code still reads it before removing.
$maxdocsize = 12000;

# suffixes of the index files that are considered "wanted" when
# build_index scans the index directory after a build
%wanted_index_files = ('td'=>1,
                       't'=>1,
                       'tl'=>1,
                       'ti'=>1,
                       'idb'=>1,
                       'ib1'=>1,
                       'ib2'=>1,
                       'ib3'=>1,
                       'i'=>1,
                       'il'=>1,
                       'w'=>1,
                       'wa'=>1);

# change this so a user can add their own ones in via a file or cfg
# Long field names map to their two letter short names; every short
# name is also present as a key with the value 1.
%static_indexfield_map = ('Title'=>'TI',
                          'TI'=>1,
                          'Subject'=>'SU',
                          'SU'=>1,
                          'Creator'=>'CR',
                          'CR'=>1,
                          'Organization'=>'OR',
                          'OR'=>1,
                          'Source'=>'SO',
                          'SO'=>1,
                          'Howto'=>'HT',
                          'HT'=>1,
                          'ItemTitle'=>'IT',
                          'IT'=>1,
                          'ProgNumber'=>'PN',
                          'PN'=>1,
                          'People'=>'PE',
                          'PE'=>1,
                          'TextOnly'=>'TX',
                          'TX'=>1);

# Constructor.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes
# by subcollection and language, loads the plugins and classifiers and
# creates the document processor (a collection specific
# "<collection>buildproc" if one exists in the collection's perllib
# directory, otherwise "mgppbuildproc").
#
# Arguments (positional): $collection, $source_dir, $build_dir,
# $verbosity, $maxdocs, $debug, $keepold, $allclassifications,
# $outhandle (defaults to STDERR) and $no_text (defaults to 0).
#
# Dies if collect.cfg is missing, if no plugins could be loaded or if
# the document processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle,
        $no_text) = @_;

    # the handle is stored by name; build_index compares it with the
    # string "STDERR"
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[],    # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else {
                    # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = \@unique;

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
            $self->{'levels'}->{$level} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, so call the constructor
    # through the name; any failure is re-thrown as a plain string
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}

# Prepares the build directory: unless running in debug or keepold
# mode the old build directory is removed and recreated together with
# its "text" subdirectory.
sub init {
    my $self = shift (@_);

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r ($self->{'build_dir'});
        &util::mk_all_dir ($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir ($textdir);
    }
}

# Records the strip_html setting and passes it on to the document
# processor.
sub set_strip_html {
    my $self = shift (@_);
    my ($strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html ($strip);
}

# Creates the compressed text for the collection.
#
# Two passes are made over the source documents: the first feeds
# "mgpp_passes -T1" to collect text statistics, the second feeds
# "mgpp_passes -T2" to do the compression; mgpp_compression_dict is run
# in between.  In debug mode nothing is executed and the documents are
# written to STDOUT instead.
#
# $textindex is handed to the document processor as the index to use.
# Dies if one of the required executables is missing or a pipe cannot
# be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat ($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat ($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    if ($self->{'levels'}->{'Section'}) {
        $mgpp_passes_sections .= "-K Section ";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n"
        if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name.  The second pass below re-opens
    # the same named handle, so the document processor keeps writing to
    # the right pipe without being told again.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        if (!-e $mgpp_passes_exe ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text (0);
    }
    else {
        $self->{'buildproc'}->set_store_text (1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();

    &plugin::begin ($self->{'pluginfo'}, $self->{'source_dir'},
                    $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end ($self->{'pluginfo'});

    # finish the first pass; STDOUT is left open in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e $mgpp_compression_dict_exe) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # re-open the named pipe handle for the second pass
        if (!-e $mgpp_passes_exe ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}

# Returns 0 if $index matches one of the "dontbuild" patterns of the
# collection configuration (the index's directory name is then added to
# the 'notbuilt' list), otherwise 1.  Each pattern is used as an
# anchored regular expression.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            if ($index =~ /^$checkstr$/) {
                push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
                return 0;
            }
        }
    }

    return 1;
}

# Builds either the single index named by $indexname (when it contains
# a word character) or every index listed in the collection
# configuration, skipping those rejected by want_built.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built ($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            $self->build_index ($index);
        }
        else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}

# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of "fields:subcollection:languages"
# descriptions.  Returns a reference to a hash holding
#   - one entry per index description giving its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap' (part => short
#     name) plus the matching '...maporder' lists recording the order in
#     which the parts were first seen,
#   - a direct entry for each fields / subcollection / languages part.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => original part, kept separately for each category so
    # that a short name used in one category cannot shadow another
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my $pindex = lc ($self->process_field ($fields));

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# Components are separated by commas.  An undefined field, or one
# without any word character, gives the empty string.  When a single
# component has no consonant after its first character the first two
# characters are used instead.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components contribute
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join ("", @components);
    }
    else {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }
}

# Resolves a directory name collision for $index.
#
# $namehash holds, for each of 'index', 'subcollection' and 'languages',
# the original part that currently owns a short name.  The first of
# $$indexref, $$subref and $$langref whose owner differs from the
# matching part of $index is advanced to its next version (modified in
# place).  Returns the three short names joined together.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    }
    return "$$indexref$$subref$$langref";
}

# Advances the short name referenced by $nameref (modified in place):
#   - a name ending in two digits has that number incremented,
#   - a name ending in a single digit has the digit incremented, with 9
#     rolling over to 10,
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # $num is still a string here, so ++ keeps a leading zero
        # ("01" becomes "02")
        my $num = $1;
        $num++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one digit is present, so that is what gets replaced
            $$nameref =~ s/\d$/10/;
        }
        else {
            $num++;
            $$nameref =~ s/\d$/$num/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds one index.
#
# $index is a "fields:subcollection:languages" description whose
# directory name must already be present in $self->{'index_mapping'}.
# The source documents are piped through "mgpp_passes -I1" and
# "mgpp_passes -I2", with mgpp_perf_hash_build run in between and
# mgpp_weights_build, mgpp_invf_dict and mgpp_stem_idx run afterwards.
# In debug mode nothing is executed and the documents are written to
# STDOUT instead.  Dies if a required executable is missing, a pipe
# cannot be opened or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat ($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    my $mgpp_passes_sections = "";
    foreach my $level (keys (%{$self->{'levels'}})) {
        if ($level eq "Section" || $level eq "Paragraph") {
            $mgpp_passes_sections .= "-K $level ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat ($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $sub (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$sub})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$sub});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language expression
        my $not = 0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            if ($lang eq $wanted) {
                if ($not) {
                    push (@$indexexparr, "!Language/$wanted/");
                }
                else {
                    push (@$indexexparr, "Language/$wanted/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n"
        if ($self->{'verbosity'} >= 1);

    # The handle is passed around by name; the second pass re-opens the
    # same named handle, so the document processor keeps writing to the
    # right pipe.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        if (!-e $mgpp_passes_exe ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e $mgpp_perf_hash_build_exe) {
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e $mgpp_passes_exe ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {
        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e $mgpp_weights_build_exe) {
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e $mgpp_invf_dict_exe) {
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e $mgpp_stem_idx_exe) {
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        # (the removal itself is disabled, files are only reported)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir (DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}

# Creates the info database and processes associated files.
#
# The collection metadata, every document (via the plugins), the
# classification information and a browse list of all documents are
# written to a pipe into txt2db, or to STDOUT in debug mode.  Dies if
# txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat ($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat ($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb
    # called separately from build_index)
    if (scalar (keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # check build.cfg to see if indexfields have been filled in
        my $buildconfigfile = &util::filename_cat ($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg");
        if (-e $buildconfigfile) {
            my $buildcfg = &colcfg::read_build_cfg ($buildconfigfile);
            if (defined $buildcfg->{'indexfields'}) {
                foreach my $field (@{$buildcfg->{'indexfields'}}) {
                    $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
                }
            }
            if (defined $buildcfg->{'indexfieldmap'}) {
                # entries have the form "long->short"
                foreach my $field (@{$buildcfg->{'indexfieldmap'}}) {
                    my ($longname, $shortname) = $field =~ /^(.*)\-\>(.*)$/;
                    $self->{'buildproc'}->{'indexfieldmap'}->{$longname} = $shortname;
                }
            }
        }
    }

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        if (!-e $txt2db_exe || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $defaultfound = 0;
            my $first = 1;
            my $metadata_entry = "";
            my $default = "";
            my $cmetamap = "";

            # names starting with "." refer to an index, subcollection or
            # language and are written under the mapped short name
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }

            my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta};

            #iterate through the languages
            foreach my $lang (keys (%$values)) {
                if ($first) {
                    $first = 0;
                    #set the default default to the first entry
                    $default = $values->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" . $values->{'default'} .
                        "\n" . $metadata_entry;
                }
                else {
                    # language specific keys look like "[l=xx]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $values->{$lang} . "\n";
                    }
                }
            }

            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }

            #write the entry to the file
            print $handle $metadata_entry;
        }

        #add the indexfieldmap macros to [collection]
        # eg <TI>Title
        #    <SU>Subject
        # these may be overidden for other langs if add to macro files
        my $field_entry = "";
        foreach my $longfield (keys %{$self->{'buildproc'}->{'indexfieldmap'}}) {
            my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
            # entries whose value is 1 are the short names themselves
            next if $shortfield eq 1;
            $field_entry .= "<$shortfield>$longfield\n";
        }
        print $handle $field_entry;

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar (@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};
}

# Hook for collection specific processing; does nothing here.
sub collect_specific {
    my $self = shift (@_);
}

# Writes build.cfg into the build directory.  It records the build date
# and type, the number of documents and bytes, the index /
# subcollection / language mappings, the indexes that were not built
# and the indexed field information.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp";

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . '->' .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . '->' .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . '->' .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    #add all fields bit
    foreach my $field (keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@indexfieldmap, $field . '->' .
              $self->{'buildproc'}->{'indexfieldmap'}->{$field});
    }
    $build_cfg->{'indexfieldmap'} = \@indexfieldmap;

    #store the indexed field information
    foreach my $field (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
        push (@{$build_cfg->{'indexfields'}}, $field);
    }

    # write out the build information
    &cfgread::write_cfg_file ("$self->{'build_dir'}/build.cfg", $build_cfg,
                              '^(builddate|buildtype|numdocs|numbytes)$',
                              '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
}

# Hook for clean-up after a build; does nothing here.
sub deinit {
    my $self = shift (@_);
}

# Prints the byte counts reported by the document processor for the
# pass that has just finished, and a warning when fewer than 50 bytes
# were processed (unless text was deliberately not being stored during
# a compression pass).
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    my $index = $self->{'buildproc'}->get_index();
    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    }
    else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        }
        elsif (!$self->{'no_text'}) {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}

1;