########################################################################### # # mgppbuilder.pm -- MGBuilder object # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package mgppbuilder;

use classify;
use cfgread;
use colcfg;
use plugin;
use util;
use FileHandle;

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

$maxdocsize = 12000;

# maps the level names used in collect.cfg to the tags used in the index,
# and those tags to the macros that name the levels
%level_map = ('document'=>'Doc',
              'section'=>'Sec',
              'paragraph'=>'Para',
              'Doc'=>'_textdocument_',
              'Sec'=>'_textsection_',
              'Para'=>'_textparagraph_');

#$doc_level = "Doc";
#$sec_level = "Sec";
#$para_level = "Para";

# file suffixes that build_index treats as wanted when it scans an
# index directory
%wanted_index_files = ('td'=>1,
                       't'=>1,
                       'tl'=>1,
                       'ti'=>1,
                       'idb'=>1,
                       'ib1'=>1,
                       'ib2'=>1,
                       'ib3'=>1,
                       'i'=>1,
                       'il'=>1,
                       'w'=>1,
                       'wa'=>1);

# change this so a user can add their own ones in via a file or cfg
#add AND, OR, NOT NEAR to this list - these cannot be used as field names
#also add the level names (Doc, Sec, Para)
%static_indexfield_map = ('Title'=>'TI',
                          'TI'=>1,
                          'Subject'=>'SU',
                          'SU'=>1,
                          'Creator'=>'CR',
                          'CR'=>1,
                          'Organization'=>'ORG',
                          'ORG'=>1,
                          'Source'=>'SO',
                          'SO'=>1,
                          'Howto'=>'HT',
                          'HT'=>1,
                          'ItemTitle'=>'IT',
                          'IT'=>1,
                          'ProgNumber'=>'PN',
                          'PN'=>1,
                          'People'=>'PE',
                          'PE'=>1,
                          'Coverage'=>'CO',
                          'CO'=>1,
                          'allfields'=>'ZZ',
                          'ZZ'=>1,
                          'text'=>'TX',
                          'TX'=>1,
                          'AND'=>1,
                          'OR'=>1,
                          'NOT'=>1,
                          'NEAR'=>1,
                          'Doc'=>1,
                          'Sec'=>1,
                          'Para'=>1);

# Constructor.  Reads $GSDLCOLLECTDIR/etc/collect.cfg, works out the list
# of indexes (fields x subcollections x languages), the indexing levels,
# and loads the plugins, classifiers and the build processor.
#
# Arguments (positional): collection name, source dir, build dir,
# verbosity, maxdocs, debug, keepold, allclassifications, and optionally
# an output handle (default STDERR), no_text (default 0), gli (default 0).
#
# Dies if collect.cfg is missing, if neither the document nor the section
# level is configured, if no plugins load, or if the build processor
# cannot be constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>{},   # indexes not built
                      'indexfieldmap'=>\%static_indexfield_map,
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out the indexes
    #indexes are specified with spaces, but we put them into one index
    my $indexes = $self->{'collect_cfg'}->{'indexes'};
    $self->{'collect_cfg'}->{'indexes'} = [];
    push (@{$self->{'collect_cfg'}->{'indexes'}}, join(',', @$indexes));

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else {
                    # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    my %tmphash = ();
    my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@tmparray) {
        if (!defined ($tmphash{$i})) {
            push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
            $tmphash{$i} = 1;
        }
    }

    # get the levels (Section, Paragraph) for indexing and compression
    $self->{'levels'} = {};
    $self->{'levelorder'} = [];
    if (defined $self->{'collect_cfg'}->{'levels'}) {
        # note: foreach aliases the array elements, so the tr below
        # lower-cases the entries of collect_cfg->{'levels'} in place
        foreach my $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
            $level =~ tr/A-Z/a-z/;
            $self->{'levels'}->{$level} = 1;
            push (@{$self->{'levelorder'}}, $level);
        }
    } else {
        # default to document
        $self->{'levels'}->{'document'} = 1;
        push (@{$self->{'levelorder'}}, 'document');
    }

    $self->{'doc_level'} = "document";
    if (! $self->{'levels'}->{'document'}) {
        if ($self->{'levels'}->{'section'}) {
            $self->{'doc_level'} = "section";
        } else {
            die "you must have either document or section level specified!!\n";
        }
    }
    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";

    # get the list of plugins for this collection

    #build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'} && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building

    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # construct the build processor by class name; a block eval replaces
    # the former string eval but still re-raises any failure as a string
    $self->{'buildproc'} = eval {
        $buildproctype->new($collection, $source_dir, $build_dir,
                            $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}

# Prepares the build directory.  Unless debug or keepold is set, the old
# build directory is removed and recreated together with its text
# subdirectory.
sub init {
    my $self = shift (@_);

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }
}

# Records the strip_html setting and passes it on to the build processor.
sub set_strip_html {
    my $self = shift (@_);
    my ($strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}

# Builds the compressed text in <build_dir>/text by piping the documents
# through mgpp_passes twice (-T1 to collect statistics, -T2 to compress),
# with mgpp_compression_dict run in between.  In debug mode the build
# processor writes to STDOUT and no external program is started.
#
# $textindex is handed to the build processor's set_index.
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");
    my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    }
    else {
        $osextra = " -d /";
    }

    # define the section names and possibly the doc name for mgpasses
    # the compressor doesn't need to know about paragraphs - never want to
    # retrieve them
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} . " ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level && $level ne "paragraph") {
            $mgpp_passes_sections .= "-K " . $level_map{$level} . " ";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        #print $outhandle "trying to run (compress 1) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T1 $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # outside debug mode $handle names PIPEOUT, so one close is enough
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most
    # frequent words being put into the dictionary first (-2 -k 5120)
    # note: these options are left over from mg version
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mgpp_compression_dict_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n";
        }
        system ("mgpp_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        #print $outhandle "trying to run (compress 2) mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra\n";
        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fulltextprefix\" -T2 $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n";
        }
    }
    else {
        print STDERR "\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};
    $self->print_stats();
    print STDERR "\n" if $self->{'gli'};
}

# Returns 0 (and records the index under 'notbuilt') when $index matches
# one of the 'dontbuild' entries of collect.cfg, 1 otherwise.  Each entry
# is used as an anchored regular expression.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            if ($index =~ /^$checkstr$/) {
                #push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
                $self->{'notbuilt'}->{$index} = 1;
                return 0;
            }
        }
    }

    return 1;
}

# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration,
# then computes the final field lists.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "\n" if $self->{'gli'};
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    #define the final field lists
    $self->make_final_field_list();
}

# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of "fields:subcollection:language"
# descriptions.  Returns a reference to a hash holding, for every
# description, its directory name, plus the 'indexmap',
# 'subcollectionmap' and 'languagemap' tables and their '...maporder'
# lists.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the manditory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # one table per name component, each mapping a short name to the full
    # name it stands for.  These must be separate hash references: the
    # former '' placeholders were dereferenced symbolically, which made
    # all three components share a single global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        #my ($pindex) = $self->process_field($fields);
        #$pindex = lc ($pindex);
        # now we only ever have one index, and its called 'idx'
        my $pindex = 'idx';

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
            if (!defined $mapping{"$fields"}) {
                $mapping{"$fields"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}

# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that componant - otherwise it will contain the first
# character of the first two components
#
# Returns "" when $field is undefined or holds no word character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep the first two components, reduced to their first character
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }
    else {
        # first character plus the next consonant; if there is no later
        # consonant fall back to the first two characters
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }
}

# Called when a directory name collides.  Bumps the version suffix of the
# first short name (index, then subcollection, then language) that is
# already recorded in $namehash for a different full name, and returns
# the new concatenated directory name.  The short names are passed by
# reference and modified in place.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$fields") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    }
    return "$$indexref$$subref$$langref";
}

# Advances the numeric suffix of the name referenced by $nameref in place:
# a trailing two-digit number is incremented, a trailing single digit is
# incremented (9 becomes 10), and a name with no trailing digit has its
# last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so substitute that single digit;
        # the old s/\d\d$/10/ for a trailing 9 could never match and left
        # the name unchanged
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}

# Builds one index in its own subdirectory of the build directory.
#
# $index is a "fields:subcollection:language" description whose directory
# name must already be present in $self->{'index_mapping'}.  The
# documents are piped through mgpp_passes (-I1, then -I2) and the mgpp
# helper programs are run in between and afterwards.  If the index
# dictionary (.id file) does not appear after the first pass the index is
# recorded under 'notbuilt' and the sub returns early.
# Dies if a required executable is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";

    my $exe = &util::get_os_exe ();
    my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe");

    # define the section names for mgpasses
    # define the section names and possibly the doc name for mgpasses
    my $mgpp_passes_sections = "";
    my ($doc_level) = $self->{'doc_level'};
    $mgpp_passes_sections .= "-J " . $level_map{$doc_level} ." ";

    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level ne $doc_level) {
            $mgpp_passes_sections .= "-K " . $level_map{$level}. " ";
        }
    }

    my $mgpp_perf_hash_build_exe =
        &util::filename_cat($exedir, "mgpp_perf_hash_build$exe");
    my $mgpp_weights_build_exe =
        &util::filename_cat ($exedir, "mgpp_weights_build$exe");
    my $mgpp_invf_dict_exe =
        &util::filename_cat ($exedir, "mgpp_invf_dict$exe");
    my $mgpp_stem_idx_exe =
        &util::filename_cat ($exedir, "mgpp_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mgpp_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $lang (@languages) {
        my $not=0;
        if ($lang =~ s/^\!//) {
            $not = 1;
        }
        if ($not) {
            push (@$indexexparr, "!Language/$lang/");
        } else {
            push (@$indexexparr, "Language/$lang/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    }
    else {
        if (!-e "$mgpp_passes_exe" ||
            !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I1 $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    # set up the document processr
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($self->{'levels'});
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not we quit building this index so the whole process doesn't crap out.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgppbuilder::build_index - Couldn't create index $index\n";
        print STDERR "\n\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mgpp_perf_hash_build_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n";
        }
        system ("mgpp_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mgpp_passes_exe" ||
            !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f \"$fullindexprefix\" -I2 $osextra")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mgpp_weights_build_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n";
        }
        system ("mgpp_weights_build$exe -f \"$fullindexprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mgpp_invf_dict_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n";
        }
        system ("mgpp_invf_dict$exe -f \"$fullindexprefix\" $osextra" );

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "\n" if $self->{'gli'};
        if (!-e "$mgpp_stem_idx_exe") {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n";
        }
        system ("mgpp_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mgpp_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        # (the removal itself is commented out; unwanted files are only
        # reported at verbosity > 2)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                #&util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "\n" if $self->{'gli'};
}

# Writes the collection information database by piping text to txt2db
# (or to STDOUT in debug mode): first the [collection] record with the
# collection metadata and the field, level, subcollection and language
# display names, then the documents (via the plugins), the classifier
# information and finally the [browselist] record.
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    # define the indexed field mapping if not already done so (ie if infodb called separately from build_index)
    if (!defined $self->{'build_cfg'}) {
        $self->read_final_field_list();
    }
    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "\n\n" if $self->{'gli'};
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgppbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});

    $self->{'buildproc'}->reset();

    # do the collection info
    print $handle "[collection]\n";

    # first do the collection meta stuff - everything without a dot
    my $collmetadefined = 0;
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        $collmetadefined = 1;
        # sorted so that the record is written in a stable order
        foreach my $cmeta (sort keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            next if ($cmeta =~ /^\./); # for now, ignore ones with dots
            my ($metadata_entry) = $self->create_language_db_map($cmeta, $cmeta);
            #write the entry to the file
            print $handle $metadata_entry;
        } # foreach collmeta key
    }

    #add the index field macros to [collection]
    # eg <TI>Title
    #    <SU>Subject
    # these now come from collection meta. if that is not defined, uses the metadata name
    my $field_entry = "";
    foreach my $longfield (@{$self->{'build_cfg'}->{'indexfields'}}) {
        my $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
        # a value of 1 marks a reserved name rather than a short name
        next if $shortfield eq 1;

        # we need to check if some coll meta has been defined
        my $collmeta = ".$longfield";
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $field_entry .= $self->create_language_db_map($collmeta, $shortfield);
        } else {
            #use the metadata names, or the text macros for allfields and textonly
            if ($longfield eq "allfields") {
                $field_entry .= "<$shortfield>_query:textallfields_\n";
            } elsif ($longfield eq "text") {
                $field_entry .= "<$shortfield>_query:texttextonly_\n";
            } else {
                $field_entry .= "<$shortfield>$longfield\n";
            }
        }
    }
    print $handle $field_entry;

    # now add the level names
    my $level_entry = "";
    foreach my $level (@{$self->{'collect_cfg'}->{'levels'}}) {
        my $collmeta = ".$level"; # based on the original specification
        # lower-case a copy so that the configuration entry is left alone
        (my $lclevel = $level) =~ tr/A-Z/a-z/;
        my $levelid = $level_map{$lclevel}; # find the actual value we used in the index
        if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
            $level_entry .= $self->create_language_db_map($collmeta, $levelid);
        } else {
            # use the default macro
            $level_entry .= "<$levelid>" . $level_map{$levelid} . "\n";
        }
    }
    print $handle $level_entry;

    # now add subcoll meta
    my $subcoll_entry = "";
    foreach my $subcoll (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        # looked up before the test: the fallback branch needs it too
        my $shortname = $self->{'index_mapping'}->{$subcoll};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$subcoll"}) {
            $subcoll_entry .= $self->create_language_db_map(".$subcoll", $shortname);
        } else {
            $subcoll_entry .= "<$shortname>$subcoll\n";
        }
    }
    print $handle $subcoll_entry;

    # now add language meta
    my $lang_entry = "";
    foreach my $lang (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        # looked up before the test: the fallback branch needs it too
        my $shortname = $self->{'index_mapping'}->{$lang};
        if (defined $self->{'collect_cfg'}->{'collectionmeta'}->{".$lang"}) {
            $lang_entry .= $self->create_language_db_map(".$lang", $shortname);
        } else {
            $lang_entry .= "<$shortname>$lang\n";
        }
    }
    print $handle $lang_entry;

    #end the collection entry
    print $handle "\n" . ('-' x 70) . "\n";

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "0\n";
    print $handle "VList\n";
    print $handle "" . ($#doclist+1) . "\n";
    print $handle "Invisible\n";
    print $handle "$docs";
    print $handle "\n" . ('-' x 70) . "\n";
    close ($handle) if !$self->{'debug'};
    print STDERR "\n" if $self->{'gli'};
}

# Builds the database lines for one collectionmeta item.
#
# $metaname is the key into collect_cfg->{'collectionmeta'}; $mapname is
# the tag name to write.  Returns a string with the default line
# "<mapname>value" first, followed by one "<mapname:lang>value" line for
# each key of the form [l=lang].  When the item has no 'default' key the
# default value is the English entry if there is one, otherwise the first
# entry in sorted key order.
sub create_language_db_map {
    my $self = shift (@_);
    my ($metaname, $mapname) = @_;
    my $outhandle = $self->{'outhandle'};
    my $defaultfound=0;
    my $first=1;
    my $metadata_entry = "";
    my $default="";
    my $values = $self->{'collect_cfg'}->{'collectionmeta'}->{$metaname};

    #iterate through the languages (sorted so the result is repeatable)
    foreach my $lang (sort keys (%$values)) {
        if ($first) {
            $first=0;
            #set the default default to the first entry
            $default=$values->{$lang};
        }
        if ($lang =~ /default/) {
            $defaultfound=1;
            #the default entry goes first
            $metadata_entry = "<$mapname>" . $values->{'default'} . "\n" . $metadata_entry;
        }
        else {
            my ($l) = $lang =~ /^\[l=(\w*)\]$/;
            if ($l) {
                $metadata_entry .= "<$mapname:$l>" . $values->{$lang} . "\n";

                # Use the English value as the default if no default is specified
                if ($l =~ /en/i) {
                    $default=$values->{$lang};
                }
            }
        }
    } #foreach lang

    #if we haven't found a default, put one in
    if (!$defaultfound) {
        $metadata_entry = "<$mapname>$default\n" . $metadata_entry;
    }
    return $metadata_entry;
}

# Does nothing here.
sub collect_specific {
    my $self = shift (@_);
}

# at the end of building, we have an indexfieldmap with all the mappings, plus
# some extras, and indexmap with any indexes in it that weren't specified in the index definition.
# we want to make an ordered list of fields that are indexed, and a list of mappings that are used.
# this will be used for the build.cfg file, and for collection meta definition
# we store these in a build.cfg bit
#
# Fills $self->{'build_cfg'} with two lists derived from the configured
# indexes: 'indexfields' (the field names, in the order they were
# specified) and 'indexfieldmap' ("field->SHORT" strings).  The pseudo
# field "metadata" expands to every field the build processor indexed
# that was not named explicitly, in sorted order.
sub make_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};

    # store the indexfieldmap information
    my @indexfieldmap = ();
    my @indexfields = ();
    my $specifiedfields = {};
    my @specifiedfieldorder = ();

    # go through the index definition and add each thing to a map, so we
    # can easily check if it is already specified - when doing the
    # metadata, we print out all the individual fields, but some may
    # already be specified in the index definition, so we dont want to
    # add those again.
    foreach my $indexspec (@{$self->{'collect_cfg'}->{'indexes'}}) {
        # remove subcoll stuff
        my $parts = $indexspec;
        $parts =~ s/:.*$//;
        foreach my $f (split(',', $parts)) {
            if (!defined $specifiedfields->{$f}) {
                $specifiedfields->{$f}=1;
                push (@specifiedfieldorder, "$f");
            }
        }
    }

    my $shortname_of = $self->{'buildproc'}->{'indexfieldmap'};

    #add all fields bit
    foreach my $field (@specifiedfieldorder) {
        if ($field eq "metadata") {
            # sorted so that build.cfg comes out the same on every run
            foreach my $newfield (sort keys %{$self->{'buildproc'}->{'indexfields'}}) {
                if (!defined $specifiedfields->{$newfield}) {
                    push (@indexfieldmap, $newfield . "->" . $shortname_of->{$newfield});
                    push (@indexfields, "$newfield");
                }
            }
        } elsif ($field eq 'text') {
            push (@indexfieldmap, "text\-\>TX");
            push (@indexfields, "text");
        } elsif ($field eq 'allfields') {
            push (@indexfieldmap, "allfields\-\>ZZ");
            push (@indexfields, "allfields");
        } else {
            push (@indexfieldmap, $field . "->" . $shortname_of->{$field});
            push (@indexfields, "$field");
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}

# recreate the field list from the build.cfg file, look first in building, then in index to find it. if there is no build.cfg, we cant do the field list (there is unlikely to be any index anyway.)
# Rebuild the 'indexfields' and 'indexfieldmap' lists in $self->{'build_cfg'}
# from an existing build.cfg, looking in build_dir first and then in
# $ENV{'GSDLCOLLECTDIR'}/index. Each well-formed "field->shortname" entry
# read is also copied into buildproc's indexfieldmap. If no build.cfg is
# found, build_cfg is left as an empty hash and the sub returns early.
sub read_final_field_list {
    my $self = shift (@_);

    $self->{'build_cfg'} = {};
    my @indexfieldmap = ();
    my @indexfields = ();

    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
        # set the default mapping
        $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    }

    # we read the stuff in from the build.cfg file - if its there
    my $buildconfigfile = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    if (!-e $buildconfigfile) {
        # try the index dir - but do we know where it is?? try here
        $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg");

        # we can't find a config file - just ignore the field list
        return if (!-e $buildconfigfile);
    }

    my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            push (@indexfields, "$field");
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $entry (@{$buildcfg->{'indexfieldmap'}}) {
            push (@indexfieldmap, "$entry");

            # only well-formed "field->shortname" entries update the
            # mapping; a line without "->" used to store an undef value
            # under the empty key
            if ($entry =~ /^(.*)\-\>(.*)$/) {
                $self->{'buildproc'}->{'indexfieldmap'}->{$1} = $2;
            }
        }
    }

    $self->{'build_cfg'}->{'indexfieldmap'} = \@indexfieldmap;
    $self->{'build_cfg'}->{'indexfields'} = \@indexfields;
}

# Assemble the build information (build date and type, index levels, text
# level, document and byte counts, index / subcollection / language maps and
# the list of indexes that were not built) on top of whatever is already in
# $self->{'build_cfg'}, and write it to build.cfg inside build_dir.
sub make_auxiliary_files {
    my $self = shift (@_);

    # this already includes indexfieldmap and indexfields
    my $build_cfg = {};
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};
    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??

    # store the level info
    my @indexlevels = ();
    foreach my $level (@{$self->{'levelorder'}}) {
        push (@indexlevels, $level_map{$level});
    }
    $build_cfg->{'indexlevels'} = \@indexlevels;

    if ($self->{'levels'}->{'section'}) {
        $build_cfg->{'textlevel'} = $level_map{'section'};
    }
    else {
        $build_cfg->{'textlevel'} = $level_map{'document'};
    }

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names,
    # leaving out any index that was not built
    my @indexmap = ();
    foreach my $index (@{$mapping->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . "->" . $mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" . $mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" . $mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # sorted so that build.cfg does not change with hash ordering
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # write out the build information, building the path the same way
    # read_final_field_list does
    # NOTE(review): the two patterns presumably select the single-valued and
    # the list-valued keys respectively - confirm against cfgread
    my $build_cfg_file = &util::filename_cat($self->{'build_dir'}, "build.cfg");
    &cfgread::write_cfg_file($build_cfg_file, $build_cfg,
                             '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');

    print STDERR "\n" if $self->{'gli'};
}

# Placeholder: takes the builder object and does no further work.
sub deinit {
    my $self = shift (@_);
}

# Print the byte counts reported by buildproc for the current pass to the
# builder's output handle, with a warning banner when fewer than 50 bytes
# were processed and that pass was expected to have text.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    }
    else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # a compression pass with no_text set is not expected to see any text,
    # so it is not warned about
    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        }
        elsif (!$self->{'no_text'}) {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}

1;