source: trunk/gsdl/bin/script/lucene_passes.pl@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 17 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
# Compile-time setup: refuse to run without the Greenstone environment
# variables, then put the Greenstone Perl library directories at the
# front of @INC so that the "use" lines below can be resolved.
BEGIN {
    foreach my $required_var ('GSDLHOME', 'GSDLOS') {
        die "$required_var not set\n" unless defined $ENV{$required_var};
    }

    # Each directory is unshifted in turn, so the last one listed ends
    # up first in @INC.
    foreach my $lib_subdir ("perllib", "perllib/cpan", "perllib/plugins", "perllib/classify") {
        unshift (@INC, "$ENV{'GSDLHOME'}/$lib_subdir");
    }
}
37
38use util;
39use ghtml;
40
# Start the Java Lucene indexer (GS2LuceneIndexer from LuceneWrap.jar under
# $GSDLHOME/bin/java) and attach its standard input to the global PIPEOUT
# filehandle.
#
# Arguments (positional):
#   $doc_tag_level  - passed through as the first positional argument
#   $full_builddir  - build directory, passed inside double quotes
#   $indexdir       - index directory/name, passed inside double quotes
#   $create         - pre-formatted option string (may be empty), inserted
#                     verbatim into the command line
#   $verbosity      - value given to the -verbosity option
#
# Returns the (true) result of open(); dies if the pipe cannot be started.
sub open_java_lucene
{
    my ($doc_tag_level, $full_builddir, $indexdir, $create, $verbosity) = @_;

    my $bin_java  = util::filename_cat($ENV{'GSDLHOME'}, "bin", "java");
    my $classpath = util::filename_cat($bin_java, "LuceneWrap.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.nzdl.gsdl.LuceneWrap.GS2LuceneIndexer";
    my $cmd_options = "$create -verbosity $verbosity";

    # Both path-like arguments are quoted so that a directory containing
    # spaces is still handed to Java as a single argument.
    my $java_cmd = "$java_lucene $cmd_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # PIPEOUT stays a package-level handle because the rest of the script
    # prints to it and closes it by that name. The explicit "|-" mode keeps
    # the command string from being parsed for an open mode.
    open (PIPEOUT, "|-", $java_cmd)
        or die "$PROGNAME - couldn't run $java_cmd: $!\n";
}
54
# Close the pipe to the Java Lucene indexer (the global PIPEOUT handle).
#
# Closing a piped handle waits for the child process; close() returns false
# if the close itself failed or if the child exited with a non-zero status.
# That case is reported on STDERR instead of being silently ignored.
#
# Returns 1 when the indexer finished cleanly, 0 otherwise.
sub close_java_lucene
{
    return 1 if close(PIPEOUT);

    # $! is 0 when the only problem was a non-zero exit status of the child.
    my $reason = $! ? "error closing pipe: $!" : "exit status " . ($? >> 8);
    print STDERR "$PROGNAME - Java Lucene indexer did not finish cleanly ($reason)\n";
    return 0;
}
59
# Write one document's XML to a file below the text directory.
#
# Arguments (positional):
#   $full_textdir     - directory under which the document is stored
#   $output_filename  - file name (possibly with sub-directories) relative
#                       to $full_textdir
#   $doc_xml          - the complete XML text to write
#
# Any missing parent directories are created first. Dies if the file
# cannot be opened or if the data cannot be completely written.
sub save_xml_doc
{
    my ($full_textdir, $output_filename, $doc_xml) = @_;

    # NOTE(review): $dir_sep is interpolated into the pattern below as-is,
    # so util::get_os_dirsep() is presumably returning a regex-ready
    # separator -- confirm against util.pm before adding any quoting here.
    my $dir_sep = util::get_os_dirsep();

    my $full_output_filename = util::filename_cat($full_textdir, $output_filename);

    # Everything up to and including the last separator is the directory part.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);
    util::mk_all_dir($full_output_dir) if defined $full_output_dir;

    open (my $doc_out, ">", $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors (e.g. a full disk) only show up at close time.
    close ($doc_out)
        or die "Unable to finish writing $full_output_filename: $!";

    return;
}
77
# Compress a stored document in place by running gzip on it.
#
# Arguments (positional):
#   $full_textdir     - directory under which the document is stored
#   $output_filename  - file name relative to $full_textdir
#
# Returns true if gzip exited successfully, false otherwise (a message is
# printed on STDERR in that case).
sub compress_xml_doc
{
    my ($full_textdir, $output_filename) = @_;

    my $full_output_filename = util::filename_cat($full_textdir, $output_filename);

    # The list form of system() does not go through the shell, so spaces or
    # shell metacharacters in the path cannot split or alter the command.
    my $status = system ("gzip", $full_output_filename);
    if ($status != 0) {
        print STDERR "gzip failed for $full_output_filename (status $status)\n";
    }

    return $status == 0;
}
87
# Read the XML stream produced by the build process from STDIN and hand
# over each complete document as soon as its closing </Doc> line is seen.
#
# Arguments (positional):
#   $mode          - "text":  the document is saved via save_xml_doc()
#                    "index": the document is printed to the PIPEOUT handle
#                             (which the caller must already have opened)
#                    any other value: documents are read and discarded
#   $full_textdir  - directory passed on to save_xml_doc() in "text" mode
#
# The file name of a document is taken from the file="..." attribute of
# its opening <Doc ...> line.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $pending_xml = "";   # text of the document collected so far
    my $doc_file    = "";   # value of the file attribute of that document

    while (my $xml_line = <STDIN>) {
        # Escape the entity so the downstream XML parser is not handed an
        # undeclared &nbsp;
        $xml_line =~ s/&nbsp;/&amp;nbsp;/g;
        $pending_xml .= $xml_line;

        $doc_file = $1 if ($xml_line =~ m/^<Doc.+file=\"(.*?)\".*>$/);

        # Nothing more to do until the document is complete
        next unless ($xml_line =~ m/^<\/Doc>$/);

        if ($mode eq "text") {
            save_xml_doc($full_textdir, $doc_file, $pending_xml);
        }
        elsif ($mode eq "index") {
            # SAX parser seems to be sensitive to blank lines
            # => collapse runs of newlines before notifying the indexer
            $pending_xml =~ s/\n+/\n/g;
            print PIPEOUT $pending_xml;
        }

        # Start collecting the next document
        $pending_xml = "";
        $doc_file    = "";
    }
}
134
135
# /** This checks the arguments on the command line, filters out the
#  *  minus options and then calls open_java_lucene (in "index" mode) and
#  *  monitor_xml_stream to begin processing. Most of the arguments are
#  *  passed on to the command line of the Java wrapper.
#  *
#  *  Recognised options:
#  *    -create          forwarded to the Java indexer as "-create"
#  *    -remove oid      forwarded as "-create -remove 'oid'"; -remove sets
#  *                     -create itself, and an additional -create before or
#  *                     after it is ignored
#  *    -verbosity num   forwarded as "-verbosity num" (default 1)
#  *  Any other minus option is reported on STDERR and skipped.
#  *
#  *  The remaining arguments must be, in order:
#  *    "text"|"index"  doc-tag-level  build-dir  index-name
#  *  If fewer than four remain, a usage message is printed and the script
#  *  exits with status 1.
#  *
#  *  @version 2.0 Added support for removing documents from the index by John Rowe
#  *
#  *  @author John Rowe, DL Consulting
#  */
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $create_requested = 0;
    my $removeoid;
    my $verbosity = 1;

    my @filtered_argv = ();

    # An index loop is used because -remove and -verbosity consume the
    # argument that follows them.
    my $i = 0;
    while ($i < $argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -create causes build to be incremental (original author's
            # note; the actual effect is decided by the Java indexer)
            if ($option eq "create") {
                print STDERR "\n\n-create set\n";
                $create_requested = 1;
            }
            # In a blinding flash of unintuitiveness -remove causes
            # -create to be set (we don't want to remove the old indexes)
            elsif ($option eq "remove") {
                # The next argument is the OID; it is mandatory
                $i++;
                if (!defined $argv[$i]) {
                    print STDERR "Remove was specified but the OID was not specified";
                    die "\n\nCannot continue";
                }
                $removeoid = $argv[$i];
                print STDERR "\n\nWe're removing the document with id: '$removeoid'\n";
            }
            # -verbosity num
            elsif ($option eq "verbosity") {
                $i++;
                if ($i < $argc) {
                    $verbosity = $argv[$i];
                }
                else {
                    print STDERR "-verbosity given without a value; using $verbosity\n";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv, $argv[$i]);
        }
        $i++;
    }

    # Build the option string for the Java executable only after all
    # arguments have been seen, so that a -create following -remove can no
    # longer overwrite (and thereby silently drop) the removal request.
    # NOTE(review): the OID is placed inside single quotes in a shell
    # command line; an OID containing a single quote would break that
    # quoting -- confirm OIDs can never contain one.
    my $create = "";
    if (defined $removeoid) {
        $create = "-create -remove '$removeoid'";
    }
    elsif ($create_requested) {
        $create = "-create";
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: $PROGNAME [-create|-remove oid|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my ($mode, $doc_tag_level, $full_builddir, $indexdir) = @filtered_argv[0 .. 3];

    if ($mode ne "text" && $mode ne "index") {
        # Input is still consumed below, but nothing will be stored or indexed
        print STDERR "Unrecognised mode '$mode' (expected \"text\" or \"index\"); documents will be discarded\n";
    }

    my $full_textdir = util::filename_cat($full_builddir, "text");

    if ($mode eq "index") {
        # don't need the lucene stuff if we are just storing the docs
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $create, $verbosity);
    }
    print STDERR "Monitoring for input!\n";
    monitor_xml_stream($mode, $full_textdir);
    if ($mode eq "index") {
        close_java_lucene();
    }
}
230
# Name of this script without its directory part, used in usage and error
# messages. It is deliberately a package-level variable because the
# subroutines above refer to it. Both "/" and "\" are treated as directory
# separators so that the path is also stripped when run on Windows.
$PROGNAME = $0;
$PROGNAME =~ s/^.*[\/\\]//;

main(@ARGV);
235
Note: See TracBrowser for help on using the repository browser.