#!/usr/bin/perl -w ########################################################################### # # lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

BEGIN {
    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
    die "GSDLOS not set\n" unless defined $ENV{'GSDLOS'};
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/plugins");
    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/classify");
}

use util;
use ghtml;


# Start the Java Lucene indexer and attach a write pipe to it.
#
# The pipe is left open in the global PIPEOUT filehandle; anything printed
# to PIPEOUT is fed to the indexer's standard input until
# close_java_lucene() is called.
#
#   $doc_tag_level  passed through to the indexer unchanged
#   $full_builddir  build directory (double-quoted on the command line)
#   $indexdir       index name, passed through unchanged
#   $create         pre-assembled option string (may be empty, or hold
#                   several shell words), placed before -verbosity
#   $verbosity      value handed to the indexer's -verbosity option
#
# Dies if the command cannot be started.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity) = @_;

    my $bin_java = util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = util::filename_cat($bin_java,"LuceneWrapper.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" $indexdir";

    # The command is deliberately a single string: $create can contain
    # several quoted words that the shell has to split for us.
    open (PIPEOUT, '|-', $java_cmd)
        or die "$PROGNAME - couldn't run $java_cmd: $!\n";
}


# Close the pipe opened by open_java_lucene().
#
# Closing a pipe waits for the child process; a false result means the
# pipe could not be flushed or the child exited with a non-zero status,
# so that is reported on STDERR instead of being dropped.
# Returns the result of close().
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);
    if (!$closed_ok) {
        print STDERR "Warning: Lucene indexer pipe did not close cleanly (errno: $!, status: $?)\n";
    }
    return $closed_ok;
}


# Write one document's XML to $output_filename beneath $full_textdir,
# creating any missing parent directories first.
#
#   $full_textdir     directory the output path is built under
#   $output_filename  file name (may include sub-directories)
#   $doc_xml          text to write
#
# Dies if the file cannot be opened or fully written.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = util::get_os_dirsep();

    my $full_output_filename = util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last separator is the directory.
    # $dir_sep is interpolated as-is, exactly as before.
    # NOTE(review): presumably get_os_dirsep() returns a regex-ready
    # separator -- confirm before wrapping it in \Q...\E.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";
    print {$doc_out} $doc_xml;

    # Buffered write errors only surface at close, so check it.
    close($doc_out)
        or die "Unable to finish writing $full_output_filename: $!";

    return;
}


# Compress a previously saved document with the external gzip program.
#
#   $full_textdir     directory the path is built under
#   $output_filename  file name relative to that directory
#
# gzip is run without a shell so that unusual characters in the path are
# passed through literally. Failure is reported on STDERR but is not fatal.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename = util::filename_cat($full_textdir,$output_filename);

    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
        print STDERR "Warning: gzip failed for $full_output_filename (status $status)\n";
    }

    return;
}


# This appears to be the callback that gets the xml stream during the
# build process, so I need to intercept it here and call my XML RPC
# to insert into the Lucene database.
# Read the XML stream from standard input one line at a time, collect the
# lines of each document, and hand every completed document on.
#
#   $mode          "text"  - save the document via save_xml_doc()
#                  "index" - print the document to the global PIPEOUT
#                            handle (opened by open_java_lucene())
#                  any other value: documents are read and discarded
#   $full_textdir  directory passed to save_xml_doc() in "text" mode
#
# A document is complete when a line consisting solely of </Doc> is read.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $doc_xml = "";
    my $output_filename = "";

    my $line;
    while (defined ($line = <STDIN>)) {
        $doc_xml .= $line;

        # Remember which file the current document belongs to.
        # NOTE(review): the <STDIN> read above and this pattern were
        # restored; the copy under review had lost everything that looked
        # like an angle-bracket tag. Confirm the file="..." attribute name
        # against what the document exporter actually emits.
        if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $output_filename = $1;
        }

        if ($line =~ m/^<\/Doc>$/) {
            if ($mode eq "text") {
                save_xml_doc($full_textdir,$output_filename,$doc_xml);
            }
            elsif ($mode eq "index") {
                # notify lucene indexer

                # SAX parser seems to be sensitive to blank lines
                # => remove them
                $doc_xml =~ s/\n+/\n/g;

                # print STDERR $doc_xml;

                ## print PIPEOUT "$output_filename\n";

                print PIPEOUT "$doc_xml";

                #save_xml_doc($full_textdir, "$output_filename.txt", $doc_xml);
            }

            # compress file
            ### compress_xml_doc($full_textdir,$output_filename);

            # Start collecting the next document from scratch.
            $doc_xml = "";
            $output_filename = "";
        }
    }
}


# /** This checks the arguments on the command line, filters the
#  *  unknown command line arguments and then calls the open_java_lucene
#  *  function to begin processing. Most of the arguments are passed on
#  *  the command line of the java wrapper.
#  *
#  *  Do not set -create and -remove at the same time. Although -create is
#  *  required for -remove, -remove sets it by itself; a -create given
#  *  after -remove is intended to be ignored.
#  *
#  *  @version 2.0 Added support for removing documents from the index by John Rowe
#  *
#  *  @author John Rowe, DL Consulting
#  */
#
# Parse the command line, start the Java indexer when running in "index"
# mode, and then process the XML stream until it ends.
#
# Recognised options:
#   -create         ask the indexer for -create
#   -remove OID     ask the indexer for "-create -remove 'OID'"; takes
#                   precedence over a plain -create wherever it appears
#   -verbosity NUM  verbosity passed to the indexer (default 1)
# Any other argument starting with '-' is reported and skipped.
#
# The remaining four positional arguments are, in order: mode ("text" or
# "index"), doc-tag-level, build directory and index name. Prints a usage
# message and exits with status 1 when fewer than four are supplied.
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $create_requested = 0;
    my $removeoid;                # OID supplied with -remove, if any
    my $verbosity = 1;
    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -create causes build to be incremental
            if ($option eq "create") {
                print STDERR "\n\n-create set\n";
                $create_requested = 1;
            }
            # In a blinding flash of unintuitiveness -remove causes
            # -create to be set (we don't want to remove the old indexes)
            elsif ($option eq "remove") {
                # The next argument must be the OID of the document.
                $i++;
                if (!defined $argv[$i]) {
                    print STDERR "Remove was specified but the OID was not specified";
                    die "\n\nCannot continue";
                }
                $removeoid = $argv[$i];
                print STDERR "\n\nWe're removing the document with id: '$removeoid'\n";
            }
            # -verbosity num
            elsif ($option eq "verbosity") {
                $i++;
                if ($i<$argc) {
                    $verbosity = $argv[$i];
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    # Build the option string only after all arguments have been seen, so
    # that a -create following -remove cannot silently discard the removal.
    my $create = "";
    if (defined $removeoid) {
        # Now, to make sure this gets through to the Java executable
        $create = "-create -remove '$removeoid'";
    }
    elsif ($create_requested) {
        $create = "-create";
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: $PROGNAME [-create|-remove oid|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir = $filtered_argv[3];
    ### print STDERR "**** ARGS = ", join(" ", @argv), "\n";

    my $full_textdir = util::filename_cat($full_builddir,"text");

    if ($mode eq "index") {
        # don't need the lucene stuff if we are just storing the docs
        open_java_lucene($doc_tag_level,$full_builddir,$indexdir,$create,$verbosity);
    }

    print STDERR "Monitoring for input!\n";
    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}


# $PROGNAME is a package global on purpose: the subroutines above use it
# in their messages. Strip any leading directory part from the script path.
$PROGNAME = $0;
$PROGNAME =~ s/^.*\/(.*)$/$1/;

main(@ARGV);