source: trunk/gsdl/perllib/lucenebuilder.pm@ 11175

Last change on this file since 11175 was 11175, checked in by kjdon, 18 years ago

added quotes around lucene_passes_exe in case it is installed in a place with spaces in the filename

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuilder;
27
28# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
29
30use mgppbuilder;
31
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
35
36
# Constructor.  Creates an mgppbuilder object from the caller's arguments,
# re-blesses it into the requested class and marks it as a lucene build by
# setting its 'buildtype' entry to "lucene".
#
# Arguments (after the class name; all are forwarded untouched to the
# mgppbuilder constructor, in this order):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications, $outhandle, $no_text, $gli
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Call the parent constructor with explicit method-call syntax.  The
    # indirect-object form ("new mgppbuilder (...)") is parsed heuristically
    # and can be mistaken for a call to a local sub named new.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    return $self;
}
50
# Returns the name of the default build processor for this builder,
# which is always the string "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
56
# this writes a nice version of the text docs
#
# Runs every document through the build processor with text indexing
# switched off and sends the output to
#   lucene_passes.pl text Doc "<build_dir>" "dummy"
# through a pipe.  When $self->{'debug'} is set, the output goes to STDOUT
# and lucene_passes.pl is not started.
#
# Arguments:
#   $textindex - handed to the build processor's set_index()
#
# Returns immediately, doing nothing, when $self->{'no_text'} is set.
# Dies when lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text {

    my $self = shift (@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir ($text_dir);

    my $is_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $osextra = "";
    if ($is_windows) {
        # Convert the path that is actually placed on the command line.
        # (Previously $text_dir was converted here, but it is not used
        # again after mk_all_dir, so the conversion had no effect.)
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # get any os specific stuff
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    # quoted in case the installation path contains spaces
    my $full_lucene_passes_exe = "\"$lucene_passes_exe\"";
    if ($is_windows) {
        $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    }
    my $lucene_passes_sections = "Doc";

    # The build processor is given the *name* of the output handle.
    my $handle;
    if ($self->{'debug'}) {
        $handle = "STDOUT";
    }
    else {
        if (!-e $lucene_passes_exe ||
            !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($levels->{'section'}) {
        $gdbm_level = "section";
    }

    # get rid of para if we had it.
    # NOTE(review): this sets the 'paragraph' entry to undef (creating the
    # key if it was absent) rather than removing it; delete() may have been
    # intended - confirm against the code that reads 'levels'.
    undef $levels->{'paragraph'};

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_gdbm_level ($gdbm_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process and returns false if
        # it failed, so report that instead of silently ignoring it.
        if (!close ($handle)) {
            print STDERR "Warning: lucenebuilder::compress_text - closing pipe to $lucene_passes_exe failed (status $?)\n";
        }
    }

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};

}
134
# Builds the wanted indexes, once for every level in $self->{'levels'}
# except paragraph.
#
# Arguments:
#   $indexname - optional.  When defined and containing a word character,
#                only that index is considered; otherwise the list in
#                $self->{'collect_cfg'}->{'indexes'} is used.
#
# For each level, the index's entry in $self->{'index_mapping'} is
# temporarily set to the first letter of the level name followed by the
# mapped directory name while build_index() runs, and restored afterwards.
# Finishes by calling make_final_field_list().
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    # (lexical loop variable, so the package global $index is left alone)
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $idx = $self->{'index_mapping'}->{$index};
        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($pindex) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $pindex.$idx;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            # NOTE(review): the level attribute is emitted unquoted, unlike
            # name and source; left unchanged because whatever reads this
            # output may depend on the exact text.
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }
        # put the plain directory name back once all levels are done
        $self->{'index_mapping'}->{$index} = $idx;
    }

    #define the final field lists
    $self->make_final_field_list();
}
181
# Builds one index at one level by running the documents through the build
# processor and piping the output into
#   lucene_passes.pl [-create] index <llevel> "<build_dir>" "<indexdir>"
# When $self->{'debug'} is set, the output goes to STDOUT and
# lucene_passes.pl is not started.
#
# Arguments:
#   $index  - index description, "fields[:subcollections[:languages]]" with
#             the subcollection and language parts comma separated
#   $llevel - level tag handed to lucene_passes.pl; it is looked up among
#             the values of %mgppbuilder::level_map to find the matching
#             key of $self->{'levels'}
#
# Dies when lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    my $is_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);

    # get any os specific stuff
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    # quoted in case the installation path contains spaces
    my $full_lucene_passes_exe = "\"$lucene_passes_exe\"";
    if ($is_windows) {
        $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    }

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    # -create is passed unless the old index is being kept
    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    my $osextra = "";
    if ($is_windows) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # names with no entry in collect_cfg's 'subcollection' are skipped
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
                push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
            }
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }

    # Each entry is passed on exactly as written, including any leading "!".
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The build processor is given the *name* of the output handle.
    my $handle;
    if ($self->{'debug'}) {
        $handle = "STDOUT";
    }
    else {
        if (!-e $lucene_passes_exe ||
            !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($store_levels->{'section'}) {
        $gdbm_level = "section";
    }

    # find the level name whose tag in level_map is $llevel
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process and returns false if
        # it failed, so report that instead of silently ignoring it.
        if (!close ($handle)) {
            print STDERR "Warning: lucenebuilder::build_index - closing pipe to $lucene_passes_exe failed (status $?)\n";
        }
    }

    $self->print_stats();

    # give the build processor the full set of levels back
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
311
3121;
313
314
Note: See TracBrowser for help on using the repository browser.