source: trunk/gsdl/perllib/lucenebuilder.pm@ 12910

Last change on this file since 12910 was 12910, checked in by kjdon, 18 years ago

new indexoptions field in collect.cfg specifies which stem indexes should be built (stem, casefold, accentfold). mg and lucene ignore this, mg does stem and casefold, lucene does none. stemindexes is output to build.cfg so that the library knows what options are available for the collection - don't give stem option if stemming is not implemented, for example. added in accent fold stem indexes for mgpp (thanks to Juan Grigera). accent folding may be disabled in mgpp, so we check the first time we try to create one, and don't try to build the rest if it failed.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.5 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44
45# use IncrementalBuildUtils;
46
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
51# /**
52# * @author John Thompson, DL Consulting Ltd.
53# */
# Construct a lucenebuilder on top of an mgppbuilder instance.
#
# All arguments after the class name are forwarded untouched to
# mgppbuilder's constructor.  By position they are:
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications, $outhandle, $no_text, $faillog, $gli,
#   $incremental
# Only $incremental (the 13th) is read directly here.
#
# Returns the object produced by mgppbuilder, re-blessed into $class, with
# these additional fields set:
#   buildtype              - "lucene"
#   full_lucene_passes     - path of lucene_passes.pl
#   full_lucene_passes_exe - command prefix used to run that script via perl
#   incremental            - the $incremental argument
sub new {
    my $class = shift(@_);
    my $incremental = $_[12];

    # Explicit class-method call.  The indirect-object form
    # "new mgppbuilder (@_)" is only parsed as a method call when perl
    # already knows the mgppbuilder package at compile time; otherwise it
    # is treated as a (recursive) function call to this very sub.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe ();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # Record the bare script path (used for existence checks) and a command
    # prefix with perl tacked on the beginning to ensure execution.
    $self->{'full_lucene_passes'} = "$lucene_passes_script";
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
    }

    # We must also record whether we have been asked to do just an incremental
    # build (which makes no difference to the Lucene indexing bit, just the
    # building of the classifiers in the GDBM).
    $self->{'incremental'} = $incremental;

    return $self;
}
88# /** new() **/
89
90# lucene has none of these options
# Switch off every stem-index option on the builder: lucene supports none
# of casefold, stem or accentfold, so stemindexes is zero as well.
sub generate_index_options {
    my ($self) = @_;

    foreach my $option (qw(casefold stem accentfold)) {
        $self->{$option} = 0;
    }
    return $self->{'stemindexes'} = 0;
}
99
# Name of the document-processor class to use when none is specified.
sub default_buildproc {
    my ($self) = @_;
    return 'lucenebuildproc';
}
105
# this writes a nice version of the text docs
#
# The document processor's output is piped into "lucene_passes.pl text".
#
# Parameters:
#   $textindex - passed straight through to the buildproc's set_index().
#
# Does nothing when $self->{'no_text'} is set.  When $self->{'debug'} is set
# the processor writes to STDOUT and lucene_passes.pl is not run.
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be opened.
sub compress_text {
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # so lucene_passes doesn't print to stderr if we redirect output
    # (not done on windows)
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR") {
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # The handle is handed to the buildproc by name, so keep it a string.
    my $handle;

    if ($self->{'debug'}) {
        $handle = "STDOUT";
    }
    else {
        # Build the command once so the logged and executed forms cannot drift.
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open (PIPEOUT, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($levels->{'section'}) {
        $gdbm_level = "section";
    }

    # get rid of para if we had it.
    # NOTE(review): this sets the value to undef but leaves the 'paragraph'
    # key present in the shared levels hash - confirm that is intended.
    undef $levels->{'paragraph'};

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Close the pipe exactly once, and only if we opened it.
    close (PIPEOUT) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
193
# Build every wanted index, once per (non-paragraph) level.
#
# Parameters:
#   $indexname - optional; when it contains a word character only that index
#                is built, otherwise all of collect_cfg's 'indexes' are used.
#
# For each level the index's entry in $self->{'index_mapping'} is temporarily
# prefixed with the first letter of the level name while build_index() runs,
# and is restored to the unprefixed name afterwards.  Finishes by calling
# make_final_field_list().
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    # (lexical loop variable: the original iterated over a package global)
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            foreach my $level (keys %{$self->{'levels'}}) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            # put the unprefixed directory name back
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    #define the final field lists
    $self->make_final_field_list();
}
240
241# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
242# * -remove and the document id on the command line.
243# *
244# * @param oid is the document identifier to be removed.
245# *
246# * @author John Rowe, DL Consulting Ltd.
247# */
sub remove_document_from_database {
    my $self = shift;
    my $oid  = shift;

    # Command prefix (perl + lucene_passes.pl) prepared by the constructor,
    # followed by -remove and the quoted document identifier.
    my $remove_cmd = $self->{'full_lucene_passes_exe'} . qq{ -remove "$oid"};

    # Run it through the shell; whatever it prints on stdout is the result.
    return `$remove_cmd`;
}
256# /** remove_document_from_database **/
257
# Build one index at one level by piping the document processor's output
# into "lucene_passes.pl [-create] index <level> <build_dir> <indexdir>".
#
# Parameters:
#   $index  - index description "fields[:subcollections[:languages]]", where
#             subcollections and languages are comma-separated lists.
#   $llevel - level tag handed to lucene_passes.pl; mapped back to a level
#             name through %mgppbuilder::level_map.
#
# "-create" is passed unless $self->{'keepold'} is set.  When
# $self->{'debug'} is set the processor writes to STDOUT and
# lucene_passes.pl is not run.
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # Each entry is passed on verbatim, including any leading "!" negation
    # (the original stripped the "!" and then re-added it).
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    push (@$langarr, @languages);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The handle is handed to the buildproc by name, so keep it a string.
    my $handle;

    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        # Build the command once so the logged and executed forms cannot drift.
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open (PIPEOUT, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the command we actually tried to run (the original
            # interpolated an undeclared variable, leaving this blank).
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($store_levels->{'section'}) {
        $gdbm_level = "section";
    }

    # Map the level tag back to the name of the level it stands for.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_gdbm_level($gdbm_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close (PIPEOUT) unless $self->{'debug'};

    $self->print_stats();

    # hand the full set of levels back to the document processor
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
384
385# /** A modified version of the basebuilder.pm's function that generates the
386# * information database (GDBM) from the GA documents. We need to change this
387# * so that if we've been asked to do an incremental build we only add
388# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
389# * module. All other classifiers and metadata will be ignored.
390# */
sub make_infodatabase {
    my ($self, @args) = @_;
    my $out = $self->{'outhandle'};

    # Database file: text/<collection>.ldb on little-endian machines,
    # text/<collection>.bdb otherwise.
    my $db_suffix = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $db_path = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $db_suffix);

    # An incremental addition is only possible when one was requested and
    # the database file already exists.  In every other case defer to
    # basebuilder's implementation, invoked as a method so that $self is
    # passed along.
    my $can_increment = $self->{'incremental'} && -e $db_path;
    unless ($can_increment) {
        $self->basebuilder::make_infodatabase(@args);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $out "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # Step 1: initialise all the classifiers.
    &classify::init_classifiers ($self->{'classifiers'});

    # Step 2: configure the buildproc.  It is still needed to process
    # associated files, but nothing is piped to txt2db so no output handle
    # is set.
    my $assoc_path = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assoc_path);
    my $proc = $self->{'buildproc'};
    $proc->set_mode ('incinfodb'); # Very Important
    $proc->set_assocdir ($assoc_path);

    # Step 3: read the documents in the source directory, feeding them
    # through the buildproc.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
434
4351;
436
437
Note: See TracBrowser for help on using the repository browser.