source: gsdl/trunk/bin/script/lucene_passes.pl@ 18470

Last change on this file since 18470 was 18456, checked in by davidb, 15 years ago

Additions to support the deleting of documents from the index. Only works for indexers that support incremental building, e.g. lucene

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1#!/usr/bin/perl -w
2
3###########################################################################
4#
5# lucene_passes.pl -- perl wrapper, akin to mgpp_passes, for Lucene
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
BEGIN {
    # The Greenstone environment must be configured before anything else
    # is compiled: both variables are required, and the Greenstone perl
    # library directory is put at the front of the module search path.
    if (!defined $ENV{'GSDLHOME'}) {
        die "GSDLHOME not set\n";
    }
    if (!defined $ENV{'GSDLOS'}) {
        die "GSDLOS not set\n";
    }

    my $perllib_dir = "$ENV{'GSDLHOME'}/perllib";
    unshift(@INC, $perllib_dir);
}
34
35
36use strict;
37use util;
38
39
# Start the Java Lucene indexer (GS2LuceneIndexer from LuceneWrapper.jar,
# located under $GSDLHOME/bin/java) as a child process and attach the
# global PIPEOUT filehandle to its standard input.
#
# Arguments:
#   $doc_tag_level       - passed to the indexer as a positional argument
#   $full_builddir       - build directory, passed to the indexer (quoted)
#   $indexdir            - index name, passed to the indexer (quoted)
#   $java_lucene_options - pre-formatted option string placed before the
#                          positional arguments (may be empty)
#
# Dies if the pipe to the java command cannot be opened.
sub open_java_lucene
{
    my ($doc_tag_level,$full_builddir,$indexdir,$java_lucene_options) = @_;

    my $bin_java = &util::filename_cat($ENV{'GSDLHOME'},"bin","java");
    my $classpath = &util::filename_cat($bin_java,"LuceneWrapper.jar");

    my $java_lucene = "java -classpath \"$classpath\" org.greenstone.LuceneWrapper.GS2LuceneIndexer";

    # Both path-like arguments are quoted so that a value containing
    # spaces still reaches the indexer as a single argument.
    my $java_cmd = "$java_lucene $java_lucene_options $doc_tag_level \"$full_builddir\" \"$indexdir\"";

    # Three-argument pipe open: the mode is given explicitly instead of
    # being parsed out of the command string. The command itself is still
    # run through the shell because the options arrive as one string.
    open (PIPEOUT, "|-", $java_cmd)
        or die "lucene_passes.pl - couldn't run $java_cmd: $!\n";
}
52
53
# Close the pipe to the Java Lucene indexer (the global PIPEOUT handle).
#
# Closing a piped handle waits for the child process, so this is where a
# failure of the indexer becomes visible. Failures are reported on STDERR
# rather than being silently dropped.
#
# Returns the result of close(): true on success, false otherwise.
sub close_java_lucene
{
    my $closed_ok = close(PIPEOUT);

    if (!$closed_ok) {
        if ($!) {
            # A system call involved in closing the pipe failed.
            print STDERR "lucene_passes.pl - error closing pipe to Lucene indexer: $!\n";
        }
        else {
            # $! is 0 when the only problem was a non-zero exit status
            # from the child; the status itself is in $?.
            my $exit_status = $? >> 8;
            print STDERR "lucene_passes.pl - Lucene indexer exited with status $exit_status\n";
        }
    }

    return $closed_ok;
}
58
59
# Write one document's XML to a file below the text directory.
#
# Arguments:
#   $full_textdir    - directory under which the document is stored
#   $output_filename - file name (may include sub-directories) relative
#                      to $full_textdir
#   $doc_xml         - the XML text to write
#
# Any missing parent directories are created first. Dies if the file
# cannot be opened or if the data cannot be completely written.
sub save_xml_doc
{
    my ($full_textdir,$output_filename,$doc_xml) = @_;

    my $dir_sep = &util::get_os_dirsep();

    my $full_output_filename = &util::filename_cat($full_textdir,$output_filename);

    # Everything up to and including the last directory separator is the
    # directory part of the output path.
    # NOTE(review): $dir_sep is interpolated as a regex fragment, so it is
    # presumably returned already escaped by util::get_os_dirsep - confirm.
    my ($full_output_dir) = ($full_output_filename =~ m/^(.*$dir_sep)/x);

    # Only try to create the directory when one was actually found.
    if (defined $full_output_dir) {
        &util::mk_all_dir($full_output_dir);
    }

    open(my $doc_out, ">", $full_output_filename)
        or die "Unable to open $full_output_filename: $!";

    print $doc_out $doc_xml;

    # Buffered write errors (e.g. a full disk) only surface at close time.
    close($doc_out)
        or die "Unable to finish writing $full_output_filename: $!";

    return;
}
78
79
# Compress a stored document with the external "gzip" program.
#
# Arguments:
#   $full_textdir    - directory under which the document is stored
#   $output_filename - file name relative to $full_textdir
#
# Returns true if gzip ran and exited with status 0, false otherwise (a
# message is printed to STDERR in that case).
sub compress_xml_doc
{
    my ($full_textdir,$output_filename) = @_;

    my $full_output_filename
        = &util::filename_cat($full_textdir,$output_filename);

    # List-form system() hands the file name to gzip as a single argument
    # without going through a shell, so spaces or shell metacharacters in
    # the path cannot split or alter the command.
    my $status = system("gzip", $full_output_filename);
    if ($status != 0) {
        print STDERR "lucene_passes.pl - gzip failed for $full_output_filename (status $status)\n";
    }

    return ($status == 0);
}
89
90
# Read the document XML stream from STDIN and hand over each complete
# <Doc> ... </Doc> record according to $mode:
#
#   "text"  - the record is saved below $full_textdir with save_xml_doc(),
#             using the file="..." attribute of the opening <Doc> tag as
#             the output file name
#   "index" - the record is printed to the PIPEOUT handle (the pipe that
#             open_java_lucene() attaches to the Lucene indexer)
#
# With any other mode the records are read and discarded. Per-document
# compression (compress_xml_doc) is currently not invoked from here.
sub monitor_xml_stream
{
    my ($mode, $full_textdir) = @_;

    my $record = "";       # text of the document collected so far
    my $record_file = "";  # file attribute of the current document

    while (my $input = <STDIN>) {
        $record .= $input;

        # Opening tag: remember which file this document belongs to.
        if (my ($file_attr) = ($input =~ m/^<Doc.+file=\"(.*?)\".*>$/)) {
            $record_file = $file_attr;
        }

        # Keep collecting lines until the closing tag arrives.
        next unless ($input =~ m/^<\/Doc>$/);

        if ($mode eq "index") {
            # The SAX parser seems to be sensitive to blank lines, so
            # runs of newlines are collapsed before notifying the indexer.
            $record =~ s/\n+/\n/g;

            print PIPEOUT "$record";
        }
        elsif ($mode eq "text") {
            save_xml_doc($full_textdir, $record_file, $record);
        }

        # Start afresh for the next document.
        $record = "";
        $record_file = "";
    }
}
136
137
# Entry point of the script.
#
# Splits the command line into minus options and positional arguments:
#
#   -removeold        passed on to the java indexer
#   -verbosity <num>  passed on to the java indexer together with its value
#   any other -option reported on STDERR and dropped
#
# The four required positional arguments are, in order: the mode ("text"
# or "index"), the doc tag level, the build directory and the index name.
# With fewer than four, a usage message is printed and the script exits
# with status 1.
#
# In "index" mode the java Lucene indexer is started before the XML
# stream on STDIN is processed and closed afterwards; in "text" mode the
# documents are only stored below <build-dir>/text.
sub main
{
    my (@argv) = @_;
    my $argc = scalar(@argv);

    my $java_lucene_options = "";
    my @filtered_argv = ();

    my $i = 0;
    while ($i<$argc) {
        if ($argv[$i] =~ m/^\-(.*)$/) {

            my $option = $1;

            # -removeold causes the existing index to be overwritten
            if ($option eq "removeold") {
                print STDERR "\n-removeold set\n";
                $java_lucene_options .= "-removeold ";
            }
            # -verbosity <num>
            elsif ($option eq "verbosity") {
                $i++;
                if ($i<$argc)
                {
                    # The trailing space keeps the value separate from any
                    # option appended after it (otherwise "-verbosity 3"
                    # followed by "-removeold" became "3-removeold").
                    $java_lucene_options .= "-verbosity " . $argv[$i] . " ";
                }
            }
            else {
                print STDERR "Unrecognised minus option: -$option\n";
            }
        }
        else {
            push(@filtered_argv,$argv[$i]);
        }
        $i++;
    }

    my $filtered_argc = scalar(@filtered_argv);

    if ($filtered_argc < 4) {
        print STDERR "Usage: lucene_passes.pl [-removeold|-verbosity num] \"text\"|\"index\" doc-tag-level build-dir index-name\n";
        exit 1;
    }

    my $mode = $filtered_argv[0];
    my $doc_tag_level = $filtered_argv[1];
    my $full_builddir = $filtered_argv[2];
    my $indexdir = $filtered_argv[3];

    # We only need the Lucene handle opened if we are indexing the
    # documents, not if we are just storing the text
    if ($mode eq "index") {
        open_java_lucene($doc_tag_level, $full_builddir, $indexdir, $java_lucene_options);
    }

    print STDERR "Monitoring for input!\n";
    my $full_textdir = &util::filename_cat($full_builddir,"text");
    monitor_xml_stream($mode, $full_textdir);

    if ($mode eq "index") {
        close_java_lucene();
    }
}
207
208
# Run the script on the command-line arguments.
main(@ARGV);
Note: See TracBrowser for help on using the repository browser.