source: branches/changing-indexers-branch/gsdl/bin/script/lucene_passes.pl@ 13725

Last change on this file since 13725 was 13725, checked in by kjdon, 17 years ago

lucene jar file renamed to LuceneWrapper.jar, package is now org.greenstone.LuceneWrapper instead of org.nzdl.gsdl.LuceneWrap

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone perl library directories at the
# front of @INC so that the 'use' lines below can be resolved.
BEGIN {
    # Both variables must be defined before anything else is attempted.
    foreach my $required_var (qw(GSDLHOME GSDLOS)) {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Each directory is unshifted in turn, so the last one listed ends up
    # first in @INC (classify, plugins, cpan, perllib).
    foreach my $lib_subdir ("perllib", "perllib/cpan", "perllib/plugins", "perllib/classify") {
        unshift(@INC, "$ENV{'GSDLHOME'}/$lib_subdir");
    }
}
37
38use util;
39use ghtml;
40
# Launch the Java GS2LuceneIndexer and attach its standard input to the
# package-level PIPEOUT filehandle (written to by monitor_xml_stream and
# closed by close_java_lucene).
#
# Arguments:
#   $doc_tag_level - passed through as the indexer's first positional argument
#   $full_builddir - build directory, passed double-quoted
#   $indexdir      - index name/directory, passed double-quoted
#   $create        - pre-built option fragment inserted verbatim into the
#                    command line ("", "-create" or "-create -remove '<oid>'")
#   $verbosity     - value given to the indexer's -verbosity option
#
# Dies if the pipe to the java process cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level, $full_builddir, $indexdir, $create, $verbosity) = @_;

    # LuceneWrapper.jar is looked up in $GSDLHOME/bin/java.
    my $bin_java  = util::filename_cat($ENV{'GSDLHOME'}, "bin", "java");
    my $classpath = util::filename_cat($bin_java, "LuceneWrapper.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";

    # $indexdir is quoted the same way as $full_builddir so that a value
    # containing spaces still reaches the indexer as a single argument.
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # The command is deliberately still a single string run via the shell:
    # $create arrives as a ready-made fragment that relies on shell word
    # splitting. The explicit '|-' mode just keeps the mode out of the
    # interpolated string.
    open(PIPEOUT, '|-', $java_cmd)
        or die "$PROGNAME - couldn't run $java_cmd: $!\n";
}
54
# Close the PIPEOUT pipe opened by open_java_lucene.
#
# Closing a piped handle waits for the child process, so this is the only
# place where a failure of the indexer can be noticed. A problem is
# reported on STDERR (via warn) rather than being silently dropped.
#
# Returns the result of close(): true on success, false if the close
# failed or the child exited with a non-zero status.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        # Capture both immediately; later calls may overwrite them.
        my ($os_error, $child_status) = ($!, $?);

        if ($os_error != 0) {
            # A system call involved in the close failed.
            warn "Error closing pipe to the Lucene indexer: $os_error\n";
        }
        else {
            # $! == 0 means the only problem was the child's exit status.
            my $exit_code = $child_status >> 8;
            my $signal    = $child_status & 127;
            my $detail    = $signal ? " (signal $signal)" : "";
            warn "Lucene indexer exited with status $exit_code$detail\n";
        }
    }

    return $closed_ok;
}
59
# Write one document's XML to <full_textdir>/<output_filename>, creating
# any missing parent directories first.
#
# Arguments:
#   $full_textdir    - directory the document is stored under
#   $output_filename - file name (may include sub-directories) relative
#                      to $full_textdir
#   $doc_xml         - the XML text to write
#
# Dies if the file cannot be opened, written or closed.
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    # NOTE(review): the separator is interpolated straight into the regex
    # below, so util::get_os_dirsep presumably returns it already escaped
    # for regex use - confirm against util.pm.
    my $dir_sep = util::get_os_dirsep();

    my $full_output_filename = util::filename_cat($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    open(my $doc_out, '>', $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print {$doc_out} $doc_xml
        or die "Unable to write to $full_output_filename: $!";

    # Buffered write errors only surface at close time.
    close($doc_out)
        or die "Unable to close $full_output_filename: $!";

    return;
}
77
# Compress a stored document in place by running gzip on
# <full_textdir>/<output_filename>.
#
# Arguments:
#   $full_textdir    - directory the document is stored under
#   $output_filename - file name relative to $full_textdir
#
# Returns 1 if gzip ran and exited with status 0, otherwise 0 (after
# reporting the failure on STDERR).
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename = util::filename_cat($full_textdir, $output_filename);

    # List-form system() bypasses the shell, so file names containing
    # spaces or shell metacharacters are passed to gzip unchanged.
    my $status = system('gzip', $full_output_filename);
    if ($status != 0) {
        print STDERR "gzip failed for $full_output_filename (status $status)\n";
        return 0;
    }

    return 1;
}
87
# Read the document XML stream from STDIN until end of input and handle
# each complete <Doc ...> ... </Doc> element according to $mode:
#
#   "text"  - the document is written to disk with save_xml_doc, using
#             the name taken from the file="..." attribute of its <Doc> line
#   "index" - runs of newlines are collapsed and the document is written
#             to PIPEOUT (the pipe opened by open_java_lucene)
#
# Any other mode consumes the input without doing anything with it.
#
# Arguments:
#   $mode         - "text" or "index"
#   $full_textdir - directory handed to save_xml_doc in "text" mode
#
# Dies if a document cannot be written to PIPEOUT.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $doc_xml         = "";   # text of the document collected so far
    my $output_filename = "";   # file="..." value of the current <Doc> line

    while (defined(my $line = <STDIN>)) {
        # Re-escape &nbsp; so the ampersand reaches the consumer as &amp;
        # (presumably because the XML parser does not know the nbsp
        # entity - TODO confirm).
        $line =~ s/&nbsp;/&amp;nbsp;/g;
        $doc_xml .= $line;

        if ($line =~ m/^<Doc.+file=\"(.*?)\".*>$/) {
            $output_filename = $1;
        }

        # Nothing more to do until the closing tag of the document arrives.
        next unless $line =~ m/^<\/Doc>$/;

        if ($mode eq "text") {
            save_xml_doc($full_textdir, $output_filename, $doc_xml);
        }
        elsif ($mode eq "index") {
            # SAX parser seems to be sensitive to blank lines => remove them
            $doc_xml =~ s/\n+/\n/g;

            print PIPEOUT $doc_xml
                or die "Unable to send document to the Lucene indexer: $!\n";
        }

        # Start collecting the next document from scratch.
        $doc_xml         = "";
        $output_filename = "";
    }

    return;
}
134
135
# /** This checks the arguments on the command line, filters out the
#  *  minus options and then calls the open_java_lucene function to begin
#  *  processing. Most of the arguments are passed on the command line of
#  *  the java wrapper.
#  *
#  *  Options:
#  *    -create          passed through to the java wrapper
#  *    -remove oid      passed through as "-create -remove 'oid'"; -remove
#  *                     sets -create itself, and a -create given before or
#  *                     after -remove does not cancel the removal
#  *    -verbosity num   verbosity level for the java wrapper (default 1)
#  *
#  *  Positional arguments (all four required):
#  *    "text"|"index"  doc-tag-level  build-dir  index-name
#  *
#  *  Prints a usage message and exits with status 1 when fewer than four
#  *  positional arguments are given; dies when -remove has no OID.
#  *
#  *  @version 2.0 Added support for removing documents from the index by John Rowe
#  *
#  *  @author John Rowe, DL Consulting
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $create_requested = 0;
    my $removeoid;
    my $verbosity = 1;

    my @filtered_argv = ();

    # An index loop is used because -remove and -verbosity consume the
    # argument that follows them.
    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {
            my $option = $1;

            if ($option eq "create") {
                print STDERR "\n\n-create set\n";
                $create_requested = 1;
            }
            # In a blinding flash of unintuitiveness -remove causes
            # -create to be set (we don't want to remove the old indexes)
            elsif ($option eq "remove") {
                # The next argument must be the OID of the document to remove
                $i++;
                if (!defined $argv[$i]) {
                    print STDERR "Remove was specified but the OID was not specified";
                    die "\n\nCannot continue";
                }
                $removeoid = $argv[$i];
                print STDERR "\n\nWe're removing the document with id: '$removeoid'\n";
            }
            # -verbosity num
            elsif ($option eq "verbosity") {
                $i++;
                if ($i < $argc) {
                    $verbosity = $argv[$i];
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $argv[$i]);
        }
        $i++;
    }

    # Build the option fragment only after all arguments have been seen,
    # so that a -create following -remove cannot overwrite the removal.
    my $create = "";
    if (defined $removeoid) {
        $create = "-create -remove '$removeoid'";
    }
    elsif ($create_requested) {
        $create = "-create";
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: $PROGNAME [-create|-remove oid|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my ($mode, $doc_tag_level, $full_builddir, $indexdir) = @filtered_argv[0 .. 3];

    my $full_textdir = util::filename_cat($full_builddir, "text");

    # don't need the lucene stuff if we are just storing the docs
    my $indexing = ($mode eq "index");

    if ($indexing) {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $create, $verbosity);
    }

    print STDERR "Monitoring for input!\n";
    monitor_xml_stream($mode, $full_textdir);

    if ($indexing) {
        close_java_lucene();
    }

    return;
}
230
# Script entry point.
#
# $PROGNAME is the package-level script name used in the usage and error
# messages above. File::Basename is used instead of a '/'-only regex so
# that the directory part is also stripped when $0 uses the separator of
# the platform the script is running on (e.g. '\' on Windows).
use File::Basename ();

$PROGNAME = File::Basename::basename($0);

main(@ARGV);
235
Note: See TracBrowser for help on using the repository browser.