########################################################################### # # lucenebuilder.pm -- perl wrapper for building index with Lucene # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### package lucenebuilder; # Use same basic XML structure setup by mgppbuilder/mgppbuildproc use mgppbuilder; sub BEGIN { @lucenebuilder::ISA = ('mgppbuilder'); } sub new { my $class = shift(@_); my ($collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug, $keepold, $allclassifications, $outhandle, $no_text, $gli) = @_; my $self = new mgppbuilder (@_); $self = bless $self, $class; # load up the document processor for building # if a buildproc class has been created for this collection, use it # otherwise, use the lucene buildproc my ($buildprocdir, $buildproctype); if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") { $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib"; $buildproctype = "${collection}buildproc"; } else { $buildprocdir = "$ENV{'GSDLHOME'}/perllib"; $buildproctype = "lucenebuildproc"; } require "$buildprocdir/$buildproctype.pm"; eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " . "\$source_dir, \$build_dir, \$verbosity, \$outhandle)"); die "$@" if $@; $self->{'buildtype'} = "lucene"; return $self; } # this writes a nice version of the text docs sub compress_text { my $self = shift (@_); my ($textindex) = @_; my $outhandle = $self->{'outhandle'}; print STDERR "Saving the document text\n"; # the text directory my $text_dir = &util::filename_cat($self->{'build_dir'}, "text"); my $build_dir = &util::filename_cat($self->{'build_dir'},""); &util::mk_all_dir ($text_dir); my $osextra = ""; if ($ENV{'GSDLOS'} =~ /^windows$/i) { $text_dir =~ s@/@\\@g; } else { if ($outhandle ne "STDERR") { # so lucene_passes doesn't print to stderr if we redirect output $osextra .= " 2>/dev/null"; } } # get any os specific stuff my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); my $full_lucene_passes_exe = $lucene_passes_exe; if ($ENV{'GSDLOS'} =~ /^windows$/i) { $full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe"; } my $lucene_passes_sections = "Doc"; my ($handle); if ($self->{'debug'}) { $handle = STDOUT; } else { if (!-e "$lucene_passes_exe" || !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) { print STDERR "\n\n" if $self->{'gli'}; die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; } $handle = lucenebuilder::PIPEOUT; } my $levels = $self->{'levels'}; my $gdbm_level = "document"; if ($levels->{'section'}) { $gdbm_level = "section"; } undef $levels->{'paragraph'}; # get rid of para if we had it. # set up the document processr $self->{'buildproc'}->set_output_handle ($handle); $self->{'buildproc'}->set_mode ('text'); $self->{'buildproc'}->set_index ($textindex); $self->{'buildproc'}->set_indexing_text (0); $self->{'buildproc'}->set_store_text(1); $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); $self->{'buildproc'}->set_levels ($levels); $self->{'buildproc'}->set_gdbm_level ($gdbm_level); $self->{'buildproc'}->reset(); &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, $self->{'buildproc'}, $self->{'maxdocs'}); &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); &plugin::end($self->{'pluginfo'}); close ($handle) unless $self->{'debug'}; close PIPEOUT; $self->print_stats(); print STDERR "\n" if $self->{'gli'}; } sub build_indexes { my $self = shift (@_); my ($indexname) = @_; my $outhandle = $self->{'outhandle'}; my $indexes = []; if (defined $indexname && $indexname =~ /\w/) { push @$indexes, $indexname; } else { $indexes = $self->{'collect_cfg'}->{'indexes'}; } # create the mapping between the index descriptions # and their directory names (includes subcolls and langs) $self->{'index_mapping'} = $self->create_index_mapping ($indexes); # build each of the indexes foreach $index (@$indexes) { if ($self->want_built($index)) { my $idx = $self->{'index_mapping'}->{$index}; foreach my $level (keys %{$self->{'levels'}}) { next if $level =~ /paragraph/; # we don't do para indexing my ($pindex) = $level =~ /^(.)/; # should probably check that new name with level # is unique ... but currently (with doc sec and para) # each has unique first letter. $self->{'index_mapping'}->{$index} = $pindex.$idx; my $llevel = $mgppbuilder::level_map{$level}; print $outhandle "\n*** building index $index at level $llevel in subdirectory " . "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1); print STDERR "\n" if $self->{'gli'}; $self->build_index($index,$llevel); } $self->{'index_mapping'}->{$index} = $idx; } else { print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1); } } #define the final field lists $self->make_final_field_list(); } sub build_index { my $self = shift (@_); my ($index,$llevel) = @_; my $outhandle = $self->{'outhandle'}; my $build_dir = $self->{'build_dir'}; # get the full index directory path and make sure it exists my $indexdir = $self->{'index_mapping'}->{$index}; &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir)); # get any os specific stuff my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; my $scriptdir = "$ENV{'GSDLHOME'}/bin/script"; my $exe = &util::get_os_exe (); my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl"); my $full_lucene_passes_exe = $lucene_passes_exe; if ($ENV{'GSDLOS'} =~ /^windows$/i) { $full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe"; } # define the section names for lucenepasses # define the section names and possibly the doc name for lucenepasses my $lucene_passes_sections = $llevel; my $osextra = ""; if ($ENV{'GSDLOS'} =~ /^windows$/i) { $build_dir =~ s@/@\\@g; } else { $osextra = " -d /"; if ($outhandle ne "STDERR") { # so lucene_passes doesn't print to stderr if we redirect output $osextra .= " 2>/dev/null"; } } # get the index expression if this index belongs # to a subcollection my $indexexparr = []; # there may be subcollection info, and language info. my ($fields, $subcollection, $language) = split (":", $index); my @subcollections = (); @subcollections = split /,/, $subcollection if (defined $subcollection); foreach $subcollection (@subcollections) { if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) { push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection}); } } # add expressions for languages if this index belongs to # a language subcollection - only put languages expressions for the # ones we want in the index # this puts a separate Language/en entry in for each language in the list # is this what we want? # should we just have one entry with Language/en,es/ ?? my @languages = (); @languages = split /,/, $language if (defined $language); foreach $language (@languages) { my $not=0; if ($language =~ s/^\!//) { $not = 1; } if ($not) { push (@$indexexparr, "!Language/$language/"); } else { push (@$indexexparr, "Language/$language/"); } } # Build index dictionary. Uses verbatim stem method print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1); print STDERR "\n" if $self->{'gli'}; my ($handle); if ($self->{'debug'}) { $handle = STDOUT; } else { if (!-e "$lucene_passes_exe" || !open (PIPEOUT, "| $full_lucene_passes_exe index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) { print STDERR "\n\n" if $self->{'gli'}; die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n"; } $handle = lucenebuilder::PIPEOUT; } my $store_levels = $self->{'levels'}; my $gdbm_level = "document"; if ($store_levels->{'section'}) { $gdbm_level = "section"; } my $dom_level = ""; foreach my $key (keys %$store_levels) { if ($mgppbuilder::level_map{$key} eq $llevel) { $dom_level = $key; } } if ($dom_level eq "") { print STDERR "Warning: unrecognized tag level $llevel\n"; $dom_level = "document"; } my $local_levels = { $dom_level => 1 }; # work on one level at a time # set up the document processr $self->{'buildproc'}->set_output_handle ($handle); $self->{'buildproc'}->set_mode ('text'); $self->{'buildproc'}->set_index ($index, $indexexparr); $self->{'buildproc'}->set_indexing_text (1); $self->{'buildproc'}->set_store_text(1); $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); $self->{'buildproc'}->set_levels ($local_levels); $self->{'buildproc'}->set_gdbm_level($gdbm_level); $self->{'buildproc'}->reset(); &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); close ($handle) unless $self->{'debug'}; $self->print_stats(); $self->{'buildproc'}->set_levels ($store_levels); print STDERR "\n" if $self->{'gli'}; } 1;