source: gs2-extensions/parallel-building/trunk/src/perllib/lucenebuilder.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl commited in the main trunk but not in the x64 branch)

File size: 17.9 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 Initial implementation of incremental building
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author David Bainbridge and Katherine Don, Waikato DL Research group
36# * @author John Rowe, DL Consulting Ltd.
37# * @author John Thompson, DL Consulting Ltd.
38# */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
53
# /**
#  * Constructor. Creates the parent mgppbuilder object, re-blesses it into the
#  * requested class and records, in one place, where lucene_passes.pl lives
#  * and how it has to be invoked:
#  *   - {'full_lucene_passes'}     the path of the script itself
#  *   - {'full_lucene_passes_exe'} the command line prefix used to run it
#  *
#  * Dies when LuceneWrapper.jar cannot be found under $GSDLHOME/bin/java.
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my ($class, @ctor_args) = @_;

    # Let the parent class build the object, then make it one of ours.
    my $self = bless(mgppbuilder->new(@ctor_args), $class);
    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation we cannot go on;
    # the presence of LuceneWrapper.jar is used as the indicator.
    my $wrapper_jar = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper.jar");
    die "***** ERROR: $wrapper_jar does not exist\n" unless -f $wrapper_jar;

    # Executable suffix for this OS (compared against ".exe" below).
    my $os_exe_suffix = &util::get_os_exe();

    # Location of lucene_passes.pl.
    my $passes_script = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Prefix the script with a perl interpreter so that it is always executed
    # through perl, whatever the platform.
    $self->{'full_lucene_passes_exe'} = ($os_exe_suffix eq ".exe")
        ? 'perl' . $os_exe_suffix . ' "' . $passes_script . '"'
        : '"' . &util::get_perl_exec() . '" -S "' . $passes_script . '"';

    return $self;
}
# /** new() **/
92
# Capability flag: this builder reports that it supports incremental
# building, so it always answers with a true value (1).
sub is_incremental_capable { return 1; }
99
# Primes the buildproc with what an earlier build already indexed, by reading
# 'indexfields' and 'indexfieldmap' from the existing build.cfg.
#
# - every entry of 'indexfields' becomes a key (value 1) in
#   $self->{'buildproc'}->{'indexfields'}
# - every "from->to" entry of 'indexfieldmap' becomes from => to in
#   $self->{'buildproc'}->{'indexfieldmap'}
#
# Returns early, changing nothing, when read_build_cfg() yields undef.
sub init_for_incremental_build {
    my ($self) = @_;

    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # Entries that are not of the form "from->to" carry no usable
            # information; skip them instead of recording an empty key.
            next unless $mapping =~ /^(.*)->(.*)$/;
            my ($from, $to) = ($1, $2);
            $self->{'buildproc'}->{'indexfieldmap'}->{$from} = $to;
        }
    }
}
122
# Lucene has none of the casefold/stem/accentfold/stemindexes options: run
# the parent implementation first, then force all four settings to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # Hash slice: switch every unsupported option off in one assignment.
    @{$self}{'casefold', 'stem', 'accentfold', 'stemindexes'} = (0, 0, 0, 0);
}
134
# Name of the buildproc module used when none is configured explicitly.
sub default_buildproc {
    my ($self) = @_;
    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
140
# Writes a nice version of the text docs: every document is run through the
# buildproc in 'text' mode and the result is piped to
# "lucene_passes.pl text Doc <build_dir> dummy".
#
# @param textindex  index specification passed on to buildproc->set_index()
#
# Nothing is done when $self->{'no_text'} is set. When $self->{'debug'} is
# set the stream goes to STDOUT and lucene_passes.pl is not started.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used after this point; the command
        # below is given $build_dir. Kept as in the original - confirm
        # whether $build_dir was meant to be converted here.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # The script location and the command prefix were worked out in new()
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $passes_cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $passes_cmd\n";
        # Two-argument pipe open is deliberate: the command string carries
        # its own quoting and the optional shell redirection in $osextra.
        if (!-e "$full_lucene_passes" || !open($handle, "| $passes_cmd"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child; a false result means the child
    # reported a problem, which would otherwise go unnoticed.
    unless ($self->{'debug'} || close ($handle))
    {
        my $reason = $! ? "error closing pipe: $!" : "wait status $?";
        print $outhandle "WARNING: lucene_passes (text) did not finish cleanly ($reason)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
224
# Builds every wanted index at every wanted level.
#
# @param indexname   optional; when it contains a word character only this
#                    index is built, otherwise all of collect_cfg's 'indexes'
# @param indexlevel  optional; when it contains a word character only the
#                    level equal to it is built, otherwise every level in
#                    $self->{'levels'}
#
# Paragraph levels are not supported by Lucene: they are reported on the
# output handle and ignored. For each index/level pair the index mapping is
# temporarily prefixed with the first letter of the level while build_index()
# runs, and restored afterwards.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Determine what level of index we want to build (may be controlled by user
    # configuration). $indexlevel is optional, so only inspect it when it was
    # actually supplied.
    my $level_requested = (defined $indexlevel && $indexlevel =~ /\w/);
    if (defined $indexlevel && $indexlevel =~ /paragraph/)
    {
        print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
    }
    my $indexlevels = [];
    foreach my $level (keys %{$self->{'levels'}})
    {
        # have we got para index?
        if ($level =~ /paragraph/)
        {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
        }
        elsif ($level_requested)
        {
            # - we only build the desired level
            if ($indexlevel eq $level)
            {
                push (@{$indexlevels}, $level);
            }
        }
        else
        {
            print "Adding all levels include: $level\n";
            push (@{$indexlevels}, $level);
        }
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            foreach my $level (@{$indexlevels}) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            # put the un-prefixed directory name back
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();
}
302
303
# Builds a single index at a single level by running every document through
# the buildproc and piping the result to
# "lucene_passes.pl [-removeold] index <level> <build_dir> <indexdir>".
#
# @param index   index description; split on ":" into fields, a
#                comma-separated subcollection list and a comma-separated
#                language list
# @param llevel  the level tag passed to lucene_passes (a value of
#                %mgppbuilder::level_map)
#
# "-removeold" is passed unless $self->{'incremental'} is set. When
# $self->{'debug'} is set the stream goes to STDOUT and lucene_passes.pl is
# not started. Dies if lucene_passes.pl does not exist or cannot be run.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # The script location and the command prefix were worked out in new()
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    # (the leading fields part is not needed here)
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
                push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
            }
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index. Entries (including a leading "!" for
    # negation) are handed on exactly as written.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = [];
    if (defined $language) {
        push (@$langarr, split (/,/, $language));
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $passes_cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $passes_cmd\n";
        # Two-argument pipe open is deliberate: the command string carries
        # its own quoting and the optional shell redirection in $osextra.
        if (!-e "$full_lucene_passes" || !open($handle, "| $passes_cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # Map the lucene level tag back to the level name used in 'levels'
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child; a false result means the child
    # reported a problem, which would otherwise go unnoticed.
    unless ($self->{'debug'} || close ($handle)) {
        my $reason = $! ? "error closing pipe: $!" : "wait status $?";
        print $outhandle "WARNING: lucene_passes (index $index) did not finish cleanly ($reason)\n";
    }

    $self->print_stats();

    # give the buildproc its full set of levels back
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
425
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents. When an incremental build
#  *  has been requested, only metadata for autohierarchy classifiers is meant
#  *  to be added (via the IncrementalBuildUtils module); all other classifiers
#  *  and metadata are ignored.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and
# we do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_dir);

    # An incremental addition is only possible when it was asked for AND an
    # information database already exists. In every other case hand over to
    # basebuilder's implementation so the database gets generated.
    my $can_add_incrementally = ($self->{'incremental'} && -e $infodb_path);
    unless ($can_add_incrementally)
    {
        # Note: calling basebuilder::make_infodatabase(@_) directly does not
        # work, as all the $self data would be lost - hence the method call.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'})
    {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings. This is still needed to process any
    #    associated files, but nothing is piped to the database, so no
    #    output handle is set up.
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assoc_dir);
    $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
    $self->{'buildproc'}->set_assocdir ($assoc_dir);

    # 3. Read in all the metadata from the files in the archives directory,
    #    with our buildproc acting as the document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
476
# /** Lucene specific document removal function. Runs the lucene_passes.pl
#  *  command prepared in new() with "-remove" and the quoted document id
#  *  appended, and hands back whatever the command printed on stdout.
#  *
#  *  NOTE(review): the id is interpolated into a shell command line as-is;
#  *  this presumably relies on ids never containing quotes or other shell
#  *  metacharacters - confirm against the callers.
#  *
#  *  @param oid is the document identifier to be removed.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift (@_);
    my $oid = shift (@_);

    # The perl + script prefix for running lucene_passes.pl
    my $passes_exe = $self->{'full_lucene_passes_exe'};

    # Assemble "<prefix> -remove "<oid>"" and run it through the shell
    my $remove_cmd = $passes_exe . ' -remove "' . $oid . '"';
    return qx{$remove_cmd};
}
# /** remove_document_from_database **/
493
# The three main 'modes' in Lucene builds are completely independent, while
# index building can be further split by level (as there may be more than
# one level).
#
# Appends one {'command' => ...} entry per step to the recipe:
#   1. compressing the text
#   2. building the info database
#   3. one build_index command per level in $self->{'levels'}
#
# @param collection  collection name appended to every buildcol.pl command
# @param recipe      array reference the command entries are pushed onto
#
# Levels are emitted in sorted order so that the same configuration always
# yields the same recipe (hash key order is not stable between runs).
sub prepare_build_recipe
{
    my ($self, $collection, $recipe) = @_;
    my $verbosity = $self->{'verbosity'};

    # Common start of every command line
    my $buildcol = 'buildcol.pl -keepold -verbosity ' . $verbosity;

    # 1. Compressing the text
    push(@{$recipe}, {'command' => $buildcol . ' -mode compress_text ' . $collection});
    # 2. Info database building
    push(@{$recipe}, {'command' => $buildcol . ' -mode infodb ' . $collection});
    # 3. Now one command each for each level of index required
    foreach my $level (sort keys %{$self->{'levels'}})
    {
        push(@{$recipe}, {'command' => $buildcol . ' -mode build_index -indexlevel ' . $level . ' ' . $collection});
    }
    # Complete!
}
513
5141;
515
516
Note: See TracBrowser for help on using the repository browser.