########################################################################### # # basebuilder.pm -- base class for collection builders # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package basebuilder;

use strict;
no strict 'refs'; # allow filehandles to be variables and viceversa

# Loaded explicitly so that STDOUT->autoflush()/STDERR->autoflush() below
# resolve even on perls that do not load IO::File on demand for
# filehandle method calls (older than 5.14).
use IO::Handle;

use classify;
use cfgread;
use colcfg;
use dbutil;
use plugin;
use util;

BEGIN {
    # set autoflush on for STDERR and STDOUT so that mgpp
    # doesn't get out of sync with plugins
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

our $maxdocsize = 12000;

# used to signify "gs2"(default) or "gs3"
# NOTE: this is file-scoped, so it is shared by every builder object
# created in the process; new() overwrites it on each construction.
my $gs_mode = "gs2";

# Constructor.
#
# Arguments (positional): collection name, source dir, build dir,
# verbosity, maxdocs, debug, keepold, incremental, incremental_dlc,
# remove_empty_classifications, outhandle, no_text, failhandle, gli,
# disable_OAI.  outhandle and failhandle default to STDERR; no_text, gli
# and disable_OAI default to 0.
#
# Reads the collection configuration, loads the plugins and classifiers
# named in it, and records the 'dontgdbm' fields.  Dies (after printing a
# message to outhandle) if no plugins could be loaded.
#
# Returns the blessed builder object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $incremental, $incremental_dlc,
        $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli, $disable_OAI) = @_;

    $outhandle  = *STDERR unless defined $outhandle;
    $no_text    = 0       unless defined $no_text;
    $failhandle = *STDERR unless defined $failhandle;

    # create a builder object
    my $self = bless {
        'collection'                   => $collection,
        'source_dir'                   => $source_dir,
        'build_dir'                    => $build_dir,
        'verbosity'                    => $verbosity,
        'maxdocs'                      => $maxdocs,
        'debug'                        => $debug,
        'keepold'                      => $keepold,
        'incremental'                  => $incremental,
        'incremental_dlc'              => $incremental_dlc,
        'remove_empty_classifications' => $remove_empty_classifications,
        'outhandle'                    => $outhandle,
        'no_text'                      => $no_text,
        'failhandle'                   => $failhandle,
        'notbuilt'                     => {},    # indexes not built
        'gli'                          => $gli,
        'disable_OAI'                  => $disable_OAI,
    }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # disable_OAI applies to greenstone 3 only and is only passed to
    # &colcfg::write_build_cfg_xml (then cfgread4gs3::write_cfg_file)
    # when writing the buildConfig.xml
    $self->{'disable_OAI'} = 0 unless defined $self->{'disable_OAI'};

    # Read in the collection configuration file.
    my ($colcfgname);
    ($colcfgname, $gs_mode) = &colcfg::get_collect_cfg_name($outhandle);

    if ($gs_mode eq "gs2") {
        $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    }
    elsif ($gs_mode eq "gs3") {
        $self->{'collect_cfg'} = &colcfg::read_collection_cfg_xml ($colcfgname);

        # this $self->{'collect_cfg_preserve'} is used for gs3 only and is
        # passed to &colcfg::write_build_cfg_xml in sub make_auxiliary_files
        # later in this basebuilder.pm; we use this preserve object because
        # $self->{'collect_cfg'}->{'classify'} somewhat gets modified during
        # the calling of &classify::load_classifiers.
        $self->{'collect_cfg_preserve'} = &colcfg::read_collection_cfg_xml ($colcfgname);
    }

    # get the database type for this collection from the collect.cfg file
    # (may be undefined)
    $self->{'infodbtype'} = $self->{'collect_cfg'}->{'infodbtype'} || "";

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $self->{'collect_cfg'}->{'separate_cjk'}
        && $self->{'collect_cfg'}->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts, $keepold);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontdb fields
    $self->{'dontdb'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontdb'}->{$dg} = 1;
        }
    }

    $self->{'maxnumeric'} = 4;

    return $self;
}

# stuff has been moved here from new, so we can use subclass methods
#
# Expands the configured index list with subcollection and language
# variants, removes duplicate index names, loads the document processor
# (buildproc) class and, unless debugging or keeping the old build,
# recreates the build directory.
#
# Relies on generate_index_list(), generate_index_options() and
# default_buildproc(), which are not defined in this file (presumably
# supplied by subclasses).
sub init {
    my $self = shift(@_);

    $self->generate_index_list();
    $self->generate_index_options();

    my $collect_cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes
    if (defined $collect_cfg->{'indexsubcollections'}) {
        my $indexes = $collect_cfg->{'indexes'};
        $collect_cfg->{'indexes'} = [];
        foreach my $subcollection (@{$collect_cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$collect_cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $collect_cfg->{'languages'}) {
        my $indexes = $collect_cfg->{'indexes'};
        $collect_cfg->{'indexes'} = [];
        foreach my $language (@{$collect_cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($collect_cfg->{'indexsubcollections'})) {
                    push (@{$collect_cfg->{'indexes'}}, "$index:$language");
                }
                else {
                    # add in an empty subcollection field
                    push (@{$collect_cfg->{'indexes'}}, $index . "::" . $language);
                }
            }
        }
    }

    if (defined($collect_cfg->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (first occurrence wins, original order is kept)
        my %seen = ();
        my @unique = ();
        foreach my $i (@{$collect_cfg->{'indexes'}}) {
            if (!defined ($seen{$i})) {
                push (@unique, $i);
                $seen{$i} = 1;
            }
        }
        $collect_cfg->{'indexes'} = \@unique;
    }
    else {
        $collect_cfg->{'indexes'} = [];
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    my $collection = $self->{'collection'};
    if (-e "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/custom/${collection}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/custombuildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "custombuildproc";
    }
    elsif (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = $self->default_buildproc();
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name rather
    # than interpolating the (collection-derived) name into a string eval.
    # Errors are still trapped and re-thrown as a plain string.
    eval {
        $self->{'buildproc'} = $buildproctype->new(
            $self->{'collection'},
            $self->{'source_dir'}, $self->{'build_dir'},
            $self->{'keepold'}, $self->{'verbosity'},
            $self->{'outhandle'});
    };
    die "$@" if $@;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        my $textdir = "$self->{'build_dir'}/text";
        &util::mk_all_dir($textdir);
    }
}

# Hands the loaded plugins and the buildproc to &plugin::deinit.
sub deinit {
    my $self = shift (@_);

    &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}

# Forwards $index to the buildproc's method of the same name.
sub set_sections_index_document_metadata {
    my $self = shift (@_);
    my ($index) = @_;

    $self->{'buildproc'}->set_sections_index_document_metadata($index);
}

# Overrides the 'maxnumeric' value (new() initialises it to 4).
sub set_maxnumeric {
    my $self = shift (@_);
    my ($maxnumeric) = @_;

    $self->{'maxnumeric'} = $maxnumeric;
}

# Records the strip_html setting on the builder and passes it on to the
# buildproc.
sub set_strip_html {
    my $self = shift (@_);
    my ($strip) = @_;

    $self->{'strip_html'} = $strip;
    $self->{'buildproc'}->set_strip_html($strip);
}

# Placeholder: only prints a reminder to STDERR; subclasses are expected
# to provide the real implementation.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    print STDERR "compress_text() should be implemented in subclass!!";
    return;
}

# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in the collection
# configuration.  Indexes rejected by want_built() are skipped.  Calls
# build_indexes_extra() once at the end.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "\n" if $self->{'gli'};
            $self->build_index($index);
        }
        else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->build_indexes_extra();
}

# Hook called at the end of build_indexes(); does nothing here.
sub build_indexes_extra {
    my $self = shift(@_);
}

# Placeholder: only prints a reminder to STDERR; subclasses are expected
# to provide the real implementation.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    print STDERR "build_index should be implemented in subclass\n";
    return;
}

# Creates the info database for the collection: makes the text and assoc
# directories, initialises the classifiers, runs every document through
# the buildproc in 'infodb' mode, then writes the collection metadata,
# the classification information and the "browselist" entry.
#
# In debug mode the database output goes to STDOUT instead of a database
# write handle.  Dies if the write handle cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    print STDERR "BuildDir: $self->{'build_dir'}\n";

    my $textdir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # Get info database file path
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $textdir);

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    my $reconstructed_docs = undef;
    if ($self->{'keepold'}) {
        # reconstruct doc_obj metadata from database for all docs
        $reconstructed_docs = &classify::reconstruct_doc_objs_metadata($self->{'infodbtype'}, $infodb_file_path);
    }

    # set up the document processor

    my ($infodb_handle);
    if ($self->{'debug'}) {
        $infodb_handle = *STDOUT;
    }
    else {
        print STDERR "Infodbtype: " . $self->{'infodbtype'} . "\n";
        $infodb_handle = &dbutil::open_infodb_write_handle($self->{'infodbtype'}, $infodb_file_path);
        if (!defined($infodb_handle)) {
            print STDERR "\n\n" if $self->{'gli'};
            die "builder::make_infodatabase - couldn't open infodb write handle\n";
        }
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_infodbtype ($self->{'infodbtype'});
    $buildproc->set_output_handle ($infodb_handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontdb ($self->{'dontdb'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);

    # make_infodatabase needs full reset even for incremental build
    # as incremental works by reconstructing all docs from the database and
    # then adding in the new ones
    $buildproc->zero_reset();

    $buildproc->{'mdprefix_fields'} = {};

    if ($self->{'keepold'}) {
        # create flat classify structure, ready for new docs to be added
        foreach my $doc_obj ( @$reconstructed_docs ) {
            print $outhandle " Adding reconstructed ", $doc_obj->get_OID(), " into classify structures\n";
            $buildproc->process($doc_obj, undef);
        }
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    # this has changed to only output collection meta if its
    # not in the config file
    $self->output_collection_meta($infodb_handle);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $self->{'infodbtype'}, $infodb_handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    # Output classifier reverse lookup, used in incremental deletion
    #&classify::print_reverse_lookup($infodb_handle);

    # output doclist
    my @doc_list = $buildproc->get_doc_list();
    my $browselist_infodb = {
        'hastxt'      => [ "0" ],
        'childtype'   => [ "VList" ],
        'numleafdocs' => [ scalar(@doc_list) ],
        'thistype'    => [ "Invisible" ],
        'contains'    => [ join(";", @doc_list) ],
    };
    print STDERR "Infodbtype: " . $self->{'infodbtype'} . "\n";
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "browselist", $browselist_infodb);

    # in debug mode the handle is STDOUT, which must stay open
    close ($infodb_handle) if !$self->{'debug'};

    print STDERR "\n" if $self->{'gli'};
}

# Assembles the build configuration (build date, document/section/byte
# counts, index/subcollection/language maps, indexes not built,
# maxnumeric) and writes it to build.cfg (gs2) or buildConfig.xml (gs3)
# in the build directory.  Subclasses may pre-populate
# $self->{'build_cfg'} and may add to it through build_cfg_extra().
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    # subclasses may have already defined stuff in here
    if (defined $self->{'build_cfg'}) {
        $build_cfg = $self->{'build_cfg'};
    }

    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'buildtype'} = $self->{'buildtype'};
    $build_cfg->{'indexstem'} = &util::get_dirsep_tail($self->{'collection'});
    $build_cfg->{'stemindexes'} = $self->{'stemindexes'};

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numsections'} = $self->{'buildproc'}->get_num_sections();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    my $index_mapping = $self->{'index_mapping'};

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap if scalar (@indexmap);

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = $self->{'maxnumeric'};

    $self->build_cfg_extra($build_cfg);

    if ($gs_mode eq "gs2") {
        &colcfg::write_build_cfg("$self->{'build_dir'}/build.cfg", $build_cfg);
    }
    if ($gs_mode eq "gs3") {
        &colcfg::write_build_cfg_xml("$self->{'build_dir'}/buildConfig.xml", $build_cfg, $self->{'collect_cfg_preserve'}, $self->{'disable_OAI'});
    }

    print STDERR "\n" if $self->{'gli'};
}

# Hook with no behaviour in the base class.
sub collect_specific {
    my $self = shift (@_);
}

# Returns 0 (and records the index under 'notbuilt') when $index matches
# one of the 'dontbuild' entries of the collection configuration,
# otherwise returns 1.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            # each 'dontbuild' entry is interpolated as a regular
            # expression and must match the whole index name
            if ($index =~ /^$checkstr$/) {
                $self->{'notbuilt'}->{$index} = 1;
                return 0;
            }
        }
    }

    return 1;
}

# Placeholder: prints a reminder to STDERR and returns a reference to an
# empty hash; subclasses are expected to provide the real implementation.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    print STDERR "create_index_mapping should be implemented in subclass\n";
    my %mapping = ();
    return \%mapping;
}

# returns a processed (two character) version of a field.
# if the field has only one component the processed
# version will contain the first two letdig characters
# of that component - otherwise it will contain the first
# letdig character of each of the first two (comma separated) components
# only uses letdig (\w) characters now
# a missing first character defaults to 'a', a missing second to '0'
# returns "" if the field is undefined or contains only whitespace
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\S/);

    # not named $a/$b: those are the package variables used by sort
    my ($first, $second);
    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # pick the first letdig from the first two field names
        ($first)  = $components[0] =~ /^[^\w]*(\w)/;
        ($second) = $components[1] =~ /^[^\w]*(\w)/;
    }
    else {
        # pick the first two letdig chars
        ($first, $second) = $field =~ /^[^\w]*(\w)[^\w]*?(\w)/i;
    }
    # there may not have been any letdigs...
    $first  = 'a' unless defined $first;
    $second = '0' unless defined $second;

    return "$first$second";
}

# Advances, in place, the numeric suffix of the string referenced by
# $nameref: two trailing digits are incremented; a single trailing digit
# is incremented (9 becomes 10); otherwise the last character is replaced
# by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;
    my $num = 0;
    if ($$nameref =~ /(\d\d)$/) {
        $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    }
    elsif ($$nameref =~ /(\d)$/) {
        $num = $1;
        if ($num == 9) {
            $$nameref =~ s/\d$/10/;
        }
        else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    }
    else {
        $$nameref =~ s/.$/0/;
    }
}

# implement this in subclass if want to add extra stuff to build.cfg
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;
}

# Fills $collection_infodb (a hash reference) from the buildproc's
# 'mdprefix_fields': each prefix is appended to "metadataset", each of its
# fields to "metadatalist-<prefix>", and the stored value for the field to
# "metadatafreq-<prefix>-<field>".
sub get_collection_meta_sets {
    my $self = shift(@_);
    my $collection_infodb = shift(@_);

    my $mdprefix_fields = $self->{'buildproc'}->{'mdprefix_fields'};
    foreach my $prefix (keys %$mdprefix_fields) {
        push(@{$collection_infodb->{"metadataset"}}, $prefix);

        foreach my $field (keys %{$mdprefix_fields->{$prefix}}) {
            push(@{$collection_infodb->{"metadatalist-$prefix"}}, $field);

            my $val = $mdprefix_fields->{$prefix}->{$field};
            push(@{$collection_infodb->{"metadatafreq-$prefix-$field"}}, $val);
        }
    }
}

# default is to output the metadata sets (prefixes) used in collection
# (written as the "collection" entry of the info database)
sub output_collection_meta {
    my $self = shift(@_);
    my $infodb_handle = shift(@_);

    my %collection_infodb = ();
    $self->get_collection_meta_sets(\%collection_infodb);
    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, "collection", \%collection_infodb);
}

# Prints byte statistics for the buildproc's current index to outhandle,
# followed by a note (when keeping the old build) or a warning (otherwise)
# if fewer than 50 bytes were processed.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $indexing_text = $self->{'buildproc'}->get_indexing_text();
    my $index = $self->{'buildproc'}->get_index();
    my $num_bytes = $self->{'buildproc'}->get_num_bytes();
    my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();

    if ($indexing_text) {
        print $outhandle "Stats (Creating index $index)\n";
    }
    else {
        print $outhandle "Stats (Compressing text from $index)\n";
    }
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {

        if ($self->{'keepold'}) {
            # an incremental build that adds nothing is not an error
            if ($num_processed_bytes == 0) {
                if ($indexing_text) {
                    print $outhandle "No additional text was added to $index\n";
                }
                elsif (!$self->{'no_text'}) {
                    print $outhandle "No additional text was compressed\n";
                }
            }
        }
        else {
            print $outhandle "***************\n";
            if ($indexing_text) {
                print $outhandle "WARNING: There is very little or no text to process for $index\n";
            }
            elsif (!$self->{'no_text'}) {
                print $outhandle "WARNING: There is very little or no text to compress\n";
            }

            print $outhandle " Was this your intention?\n";
            print $outhandle "***************\n";
        }
    }
}

1;