###########################################################################
#
# lucenebuilder.pm -- perl wrapper for building index with Lucene
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
package lucenebuilder;
# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
use mgppbuilder;
sub BEGIN {
@lucenebuilder::ISA = ('mgppbuilder');
}
sub new {
my $class = shift(@_);
my ($collection, $source_dir, $build_dir, $verbosity,
$maxdocs, $debug, $keepold, $allclassifications,
$outhandle, $no_text, $gli) = @_;
my $self = new mgppbuilder (@_);
$self = bless $self, $class;
# load up the document processor for building
# if a buildproc class has been created for this collection, use it
# otherwise, use the lucene buildproc
my ($buildprocdir, $buildproctype);
if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
$buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
$buildproctype = "${collection}buildproc";
} else {
$buildprocdir = "$ENV{'GSDLHOME'}/perllib";
$buildproctype = "lucenebuildproc";
}
require "$buildprocdir/$buildproctype.pm";
eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
"\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
die "$@" if $@;
$self->{'buildtype'} = "lucene";
return $self;
}
# this writes a nice version of the text docs
sub compress_text {
my $self = shift (@_);
my ($textindex) = @_;
my $outhandle = $self->{'outhandle'};
print STDERR "Saving the document text\n";
# the text directory
my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
my $build_dir = &util::filename_cat($self->{'build_dir'},"");
&util::mk_all_dir ($text_dir);
my $osextra = "";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$text_dir =~ s@/@\\@g;
} else {
if ($outhandle ne "STDERR") {
# so lucene_passes doesn't print to stderr if we redirect output
$osextra .= " 2>/dev/null";
}
}
# get any os specific stuff
my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
my $full_lucene_passes_exe = $lucene_passes_exe;
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe";
}
my $lucene_passes_sections = "Doc";
my ($handle);
if ($self->{'debug'}) {
$handle = STDOUT;
} else {
if (!-e "$lucene_passes_exe" ||
!open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
}
$handle = lucenebuilder::PIPEOUT;
}
my $levels = $self->{'levels'};
my $gdbm_level = "document";
if ($levels->{'section'}) {
$gdbm_level = "section";
}
undef $levels->{'paragraph'}; # get rid of para if we had it.
# set up the document processr
$self->{'buildproc'}->set_output_handle ($handle);
$self->{'buildproc'}->set_mode ('text');
$self->{'buildproc'}->set_index ($textindex);
$self->{'buildproc'}->set_indexing_text (0);
$self->{'buildproc'}->set_store_text(1);
$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
$self->{'buildproc'}->set_levels ($levels);
$self->{'buildproc'}->set_gdbm_level ($gdbm_level);
$self->{'buildproc'}->reset();
&plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
$self->{'buildproc'}, $self->{'maxdocs'});
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
&plugin::end($self->{'pluginfo'});
close ($handle) unless $self->{'debug'};
close PIPEOUT;
$self->print_stats();
print STDERR "\n" if $self->{'gli'};
}
sub build_indexes {
my $self = shift (@_);
my ($indexname) = @_;
my $outhandle = $self->{'outhandle'};
my $indexes = [];
if (defined $indexname && $indexname =~ /\w/) {
push @$indexes, $indexname;
} else {
$indexes = $self->{'collect_cfg'}->{'indexes'};
}
# create the mapping between the index descriptions
# and their directory names (includes subcolls and langs)
$self->{'index_mapping'} = $self->create_index_mapping ($indexes);
# build each of the indexes
foreach $index (@$indexes) {
if ($self->want_built($index)) {
my $idx = $self->{'index_mapping'}->{$index};
foreach my $level (keys %{$self->{'levels'}}) {
next if $level =~ /paragraph/; # we don't do para indexing
my ($pindex) = $level =~ /^(.)/;
# should probably check that new name with level
# is unique ... but currently (with doc sec and para)
# each has unique first letter.
$self->{'index_mapping'}->{$index} = $pindex.$idx;
my $llevel = $mgppbuilder::level_map{$level};
print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
"$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
$self->build_index($index,$llevel);
}
$self->{'index_mapping'}->{$index} = $idx;
} else {
print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
}
}
#define the final field lists
$self->make_final_field_list();
}
sub build_index {
my $self = shift (@_);
my ($index,$llevel) = @_;
my $outhandle = $self->{'outhandle'};
my $build_dir = $self->{'build_dir'};
# get the full index directory path and make sure it exists
my $indexdir = $self->{'index_mapping'}->{$index};
&util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
# get any os specific stuff
my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
my $exe = &util::get_os_exe ();
my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
my $full_lucene_passes_exe = $lucene_passes_exe;
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe";
}
# define the section names for lucenepasses
# define the section names and possibly the doc name for lucenepasses
my $lucene_passes_sections = $llevel;
my $osextra = "";
if ($ENV{'GSDLOS'} =~ /^windows$/i) {
$build_dir =~ s@/@\\@g;
} else {
$osextra = " -d /";
if ($outhandle ne "STDERR") {
# so lucene_passes doesn't print to stderr if we redirect output
$osextra .= " 2>/dev/null";
}
}
# get the index expression if this index belongs
# to a subcollection
my $indexexparr = [];
# there may be subcollection info, and language info.
my ($fields, $subcollection, $language) = split (":", $index);
my @subcollections = ();
@subcollections = split /,/, $subcollection if (defined $subcollection);
foreach $subcollection (@subcollections) {
if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
}
}
# add expressions for languages if this index belongs to
# a language subcollection - only put languages expressions for the
# ones we want in the index
# this puts a separate Language/en entry in for each language in the list
# is this what we want?
# should we just have one entry with Language/en,es/ ??
my @languages = ();
@languages = split /,/, $language if (defined $language);
foreach $language (@languages) {
my $not=0;
if ($language =~ s/^\!//) {
$not = 1;
}
if ($not) {
push (@$indexexparr, "!Language/$language/");
} else {
push (@$indexexparr, "Language/$language/");
}
}
# Build index dictionary. Uses verbatim stem method
print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
print STDERR "\n" if $self->{'gli'};
my ($handle);
if ($self->{'debug'}) {
$handle = STDOUT;
} else {
if (!-e "$lucene_passes_exe" ||
!open (PIPEOUT, "| $full_lucene_passes_exe index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {
print STDERR "\n\n" if $self->{'gli'};
die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
}
$handle = lucenebuilder::PIPEOUT;
}
my $store_levels = $self->{'levels'};
my $gdbm_level = "document";
if ($store_levels->{'section'}) {
$gdbm_level = "section";
}
my $dom_level = "";
foreach my $key (keys %$store_levels) {
if ($mgppbuilder::level_map{$key} eq $llevel) {
$dom_level = $key;
}
}
if ($dom_level eq "") {
print STDERR "Warning: unrecognized tag level $llevel\n";
$dom_level = "document";
}
my $local_levels = { $dom_level => 1 }; # work on one level at a time
# set up the document processr
$self->{'buildproc'}->set_output_handle ($handle);
$self->{'buildproc'}->set_mode ('text');
$self->{'buildproc'}->set_index ($index, $indexexparr);
$self->{'buildproc'}->set_indexing_text (1);
$self->{'buildproc'}->set_store_text(1);
$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
$self->{'buildproc'}->set_levels ($local_levels);
$self->{'buildproc'}->set_gdbm_level($gdbm_level);
$self->{'buildproc'}->reset();
&plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
"", {}, $self->{'buildproc'}, $self->{'maxdocs'});
close ($handle) unless $self->{'debug'};
$self->print_stats();
$self->{'buildproc'}->set_levels ($store_levels);
print STDERR "\n" if $self->{'gli'};
}
1;