source: gsdl/trunk/bin/script/lucene_passes.pl@ 19764

Last change on this file since 19764 was 19764, checked in by mdewsnip, 15 years ago

Changed to look for a collection-specific bin/java/LuceneWrapper.jar file before using the Greenstone one.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.7 KB
RevLine 
[8520]1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
[12844]7# from the New Zealand Digital Library Project at the
[8520]8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
# Compile-time setup: refuse to run without the Greenstone environment, then
# put Greenstone's perllib directory at the front of the module search path
# so that the "use" statements below can be resolved.
BEGIN {
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    unshift(@INC, "$ENV{'GSDLHOME'}/perllib");
}
34
[16263]35
36use strict;
[8520]37use util;
38
[16263]39
# Start the Java GS2LuceneIndexer as a child process and attach its standard
# input to the global PIPEOUT filehandle (written to by monitor_xml_stream
# and closed by close_java_lucene).
#
# Arguments:
#   $doc_tag_level        - placed on the indexer's command line as given
#   $full_builddir        - build directory; passed in double quotes
#   $indexdir             - index name; passed in double quotes
#   $java_lucene_options  - pre-formatted option string inserted before the
#                           positional arguments
#
# A collection-specific bin/java/LuceneWrapper.jar under GSDLCOLLECTDIR is
# preferred; if GSDLCOLLECTDIR is unset or holds no such file, the jar under
# GSDLHOME is used instead.
#
# Dies if the pipe to the java command cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$java_lucene_options) = @_;

    my $classpath;

    # Is there a collection-specific bin/java/LuceneWrapper.jar file?
    # (GSDLCOLLECTDIR is not checked in BEGIN, so it may be undefined here.)
    if (defined $ENV{'GSDLCOLLECTDIR'})
    {
        my $collect_bin_java = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},"bin","java");
        my $collect_jar = &util::filename_cat($collect_bin_java,"LuceneWrapper.jar");
        if (-f $collect_jar)
        {
            $classpath = $collect_jar;
        }
    }

    if (!defined $classpath)
    {
        # No, so use the Greenstone one
        my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
        $classpath = &util::filename_cat($bin_java,"LuceneWrapper.jar");
    }

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";

    # Both path-like arguments are quoted so that values containing spaces
    # reach the indexer as single arguments.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    open (PIPEOUT, "| $java_cmd") or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
59
[16263]60
# Close the pipe to the Java indexer (the global PIPEOUT filehandle).
#
# Closing a piped handle waits for the child process, so this is the point
# at which a failure of the indexer becomes visible.  A failure is reported
# on STDERR rather than raised, so the caller's control flow is unchanged.
#
# Returns the result of close(): true on success, false otherwise.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok)
    {
        # Capture both status variables before anything else can reset them
        my $errno = $! + 0;
        my $error_text = "$!";
        my $child_status = $?;

        if ($errno != 0)
        {
            print STDERR "lucene_passes.pl - error closing pipe to Lucene indexer: $error_text\n";
        }
        else
        {
            # $! is 0 when the only problem was a non-zero exit of the child
            print STDERR "lucene_passes.pl - Lucene indexer exited with status " . ($child_status >> 8) . "\n";
        }
    }

    return $closed_ok;
}
65
[16263]66
# Write one document's XML to a file below the text directory, creating any
# missing parent directories first.
#
# Arguments:
#   $full_textdir     - directory the output filename is relative to
#   $output_filename  - relative name of the file to write
#   $doc_xml          - the XML text to store
#
# Dies if the file cannot be opened, written or closed.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    # NOTE(review): $dir_sep is interpolated into the pattern below as a regex
    # fragment - presumably util::get_os_dirsep returns it already escaped;
    # confirm before changing how it is used.
    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Greedy match: everything up to and including the last separator
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    if (defined $full_output_dir)
    {
        &util::mk_all_dir($full_output_dir);
    }

    # Three-argument open so the filename can never be read as a mode
    open(my $doc_out, ">", $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface when the handle is closed
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
85
[16263]86
# Compress a previously saved document file in place by running gzip on it.
#
# Arguments:
#   $full_textdir     - directory the output filename is relative to
#   $output_filename  - relative name of the file to compress
#
# gzip is run without a shell (list form of system), so filenames containing
# spaces or shell metacharacters are passed through unchanged.  A failure is
# reported on STDERR.
#
# Returns true if gzip exited successfully, false otherwise.
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    my $status = system("gzip", $full_output_filename);
    if ($status != 0)
    {
        print STDERR "lucene_passes.pl - gzip failed for $full_output_filename (status $status)\n";
    }

    return ($status == 0);
}
96
[16263]97
# Read the XML stream arriving on STDIN one line at a time, collecting lines
# into the current document until its closing </Doc> line is seen, then hand
# the finished document on according to $mode:
#   "text"  - store it with save_xml_doc under $full_textdir, using the name
#             taken from the file="..." attribute of the opening <Doc> line
#   "index" - write it to the Lucene indexer through the PIPEOUT filehandle
# Any other mode leaves the document unused.  The buffer and remembered
# filename are cleared after every document.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";
    my $pending_filename = "";

    while (defined (my $input_line = <STDIN>)) {
        $pending_xml .= $input_line;

        # An opening <Doc ...> line names the file this document belongs to
        if ($input_line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $pending_filename = $1;
        }

        # Keep collecting until the document is complete
        next unless ($input_line =~ m/^<\/Doc>$/);

        if ($mode eq "text") {
            save_xml_doc($full_textdir,$pending_filename,$pending_xml);
        }
        elsif ($mode eq "index") {
            # The SAX parser seems to be sensitive to blank lines, so runs of
            # newlines are collapsed before the document is sent on
            $pending_xml =~ s/\n+/\n/g;

            print PIPEOUT $pending_xml;
        }

        # Per-document compression is currently switched off:
        ### compress_xml_doc($full_textdir,$pending_filename);

        # Start afresh for the next document
        $pending_xml = "";
        $pending_filename = "";
    }
}
143
[12844]144
# /** Checks the arguments on the command line, separates the minus options
#  *  from the positional arguments, and then processes the XML stream
#  *  arriving on STDIN.
#  *
#  *  Recognised options (collected into the option string handed to the
#  *  java wrapper):
#  *    -removeold        causes the existing index to be overwritten
#  *    -verbosity <num>  passed through with its value
#  *  Any other minus option is reported on STDERR and dropped.
#  *
#  *  Positional arguments, in order:
#  *    "text"|"index"  doc-tag-level  build-dir  index-name
#  *
#  *  Prints a usage message and exits with status 1 if fewer than four
#  *  positional arguments are given.
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $java_lucene_options = "";
    my @filtered_argv = ();

    # An index loop is used because -verbosity consumes the following argument
    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                $java_lucene_options .= "-removeold ";
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i<$argc)
                {
                    # The trailing space keeps a following option from being
                    # glued onto the verbosity value
                    $java_lucene_options .= "-verbosity " . $argv[$i] . " ";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir = $filtered_argv[3];
###    print STDERR "**** ARGS = ", join(" ", @argv), "\n";

    # We only need the Lucene handle opened if we are indexing the documents, not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = &util::filename_cat($full_builddir,"text");
    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}
214
215
# Run the script with the arguments given on the command line.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.