source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 32096

Last change on this file since 32096 was 29144, checked in by ak19, 10 years ago

Part of port from lucene3.3.0 to lucene4.7.2. LuceneWrapper related. Changes to gs2build/greenstone 2's perllib and bin/script lucene related perl scripts, to switch over from using Lucene3Wrapper to Lucene4Wrapper

  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 Initial implementation of incremental building
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author David Bainbridge and Katherine Don, Waikato DL Research group
36# * @author John Rowe, DL Consulting Ltd.
37# * @author John Thompson, DL Consulting Ltd.
38# */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49use FileUtils;
50
# Declare mgppbuilder as the parent class at compile time, so inherited
# methods are resolvable as soon as this package has been compiled.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
54
55# /**
56# * @author John Thompson, DL Consulting Ltd.
57# */
# /**
#  * Construct a lucenebuilder on top of an mgppbuilder instance.
#  *
#  * Besides the inherited state, the returned object carries:
#  *   buildtype               - always "lucene"
#  *   full_lucene_passes      - path of bin/script/lucene_passes.pl
#  *   full_lucene_passes_exe  - command string used to launch that script
#  *
#  * Dies if LuceneWrapper4.jar is not present under $GSDLHOME/bin/java.
#  *
#  * @param  @_ passed through unchanged to the mgppbuilder constructor
#  * @return the blessed builder object
#  */
sub new {
    my $class = shift(@_);

    # Direct method-call syntax. The indirect-object form
    # ("new mgppbuilder (...)") depends on parser heuristics and can be
    # mis-parsed as a plain function call.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation, then we won't be
    # able to continue. Check for existence of LuceneWrapper4.jar to see if
    # Lucene was disabled.
    my $lucene = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper4.jar");
    if (!-f $lucene) {
        die "***** ERROR: $lucene does not exist\n";
    }

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &FileUtils::filenameConcatenate($scriptdir, "lucene_passes.pl");

    # The bare script path is kept for existence checks; the "_exe" variant
    # has the perl interpreter tacked on the front to ensure execution.
    $self->{'full_lucene_passes'} = "$lucene_passes_script";
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "\"" . &util::get_perl_exec() . "\" -S \"$lucene_passes_script\"";
    }

    return $self;
}
92# /** new() **/
93
# Forward the given index to the document processor's method of the same
# name; whatever that method returns is handed back to the caller.
sub set_sections_sort_on_document_metadata {
    my ($self, $index) = @_;

    my $document_processor = $self->{'buildproc'};
    return $document_processor->set_sections_sort_on_document_metadata($index);
}
100
# Capability flag: lucene can do incremental building, so always true.
sub is_incremental_capable {
    my $supports_incremental = 1;
    return $supports_incremental;
}
107
# /**
#  * Seed the document processor with the fields recorded in the existing
#  * build.cfg, so that an incremental build knows what has already been
#  * indexed.
#  *
#  * Reads 'indexfields' and 'indexfieldmap' from the config returned by
#  * $self->read_build_cfg() and fills in the buildproc's
#  * 'extraindexfields', 'fieldnamemap' and 'allindexfields' hashes.
#  * Does nothing when no build config is available.
#  */
sub init_for_incremental_build {
    my $self = shift(@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            # extraindexfields is only supposed to have extra ones in it, not
            # those already specified in indexes, and this list has all
            # indexes in it. A check against the indexes is made before
            # things from extraindexfields are included, so this is OK.
            $self->{'buildproc'}->{'extraindexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # Entries look like "fullname->shortname"; the greedy first group
            # means the split happens at the last "->" in the entry.
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # A malformed entry (no "->") leaves both captures undefined.
            # Skip it rather than recording empty-string keys in the maps.
            next unless defined $f && defined $v;

            $self->{'buildproc'}->{'fieldnamemap'}->{$f} = $v;
            $self->{'buildproc'}->{'fieldnamemap'}->{$v} = 1;
            $self->{'buildproc'}->{'allindexfields'}->{$f} = 1;
        }
    }

    return;
}
133
134# lucene has none of these options
# Let the parent class generate its index options, then switch off
# casefold, stem, accentfold and stemindexes: lucene has none of these
# options.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported_option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$unsupported_option} = 0;
    }

    return 0;
}
145
# Name of the document-processor package used when none is specified.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
151
152# this writes a nice version of the text docs
# /**
#  * Write out the stored text of the documents by piping them through
#  * lucene_passes.pl in "text" mode.
#  *
#  * The document processor's output handle is pointed at the pipe (or at
#  * STDOUT when $self->{'debug'} is set) and the plugins are run over
#  * $self->{'source_dir'}. Nothing is done when $self->{'no_text'} is set.
#  *
#  * Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
#  *
#  * @param $textindex the index passed to the buildproc's set_index()
#  */
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories ($text_dir);

    # NOTE(review): the original rewrote "/" to "\" in $text_dir on native
    # Windows here, but $text_dir is never read again, so that rewrite had no
    # effect and has been dropped. build_index applies the same rewrite to
    # $build_dir - confirm whether that was the intent here too.
    my $osextra = "";
    my $native_windows = ($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin");
    if (!$native_windows && $outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Name this subroutine in the message (it used to claim build_index).
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child; a false return means the write
    # side failed or lucene_passes exited with a non-zero status.
    if (!$self->{'debug'} && !close ($handle))
    {
        print $outhandle "WARNING: lucene_passes (text) did not finish cleanly (status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
234
# /**
#  * Build every wanted index at every wanted level.
#  *
#  * @param $indexname  optional; when it contains a word character only
#  *                    this index is built, otherwise all indexes listed in
#  *                    $self->{'collect_cfg'}->{'indexes'} are considered
#  * @param $indexlevel optional; when it names one of the (non-paragraph)
#  *                    levels in $self->{'levels'} only that level is
#  *                    built, otherwise all non-paragraph levels are built
#  *
#  * For each index/level pair, $self->{'index_mapping'}->{$index} is
#  * temporarily set to the first letter of the level followed by the
#  * index's mapped directory name while build_index() runs, and is restored
#  * afterwards.
#  */
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Determine what levels of index we want to build (a user may select a
    # specific level to index by using the indexlevel parameter) [jmt12]
    my @supported_levels;
    foreach my $level (sort keys %{$self->{'levels'}})
    {
        # ignore paragraph levels as they are unsupported in Lucene
        if ($level =~ /paragraph/)
        {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
            next;
        }
        push (@supported_levels, $level);
    }

    # Build only the requested level if one was specified and is available;
    # otherwise build all levels defined. The selection is done after the
    # scan so that the result does not depend on hash iteration order
    # (previously, levels visited before the requested one were built too).
    my @desired_indexlevels = @supported_levels;
    if (defined $indexlevel)
    {
        my @requested = grep { $_ eq $indexlevel } @supported_levels;
        @desired_indexlevels = @requested if (scalar(@requested) > 0);
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            # we now iterate through the filtered list of index levels [jmt12]
            foreach my $level (@desired_indexlevels) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            # restore the level-independent directory name
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();
}
307
308
# /**
#  * Build one index at one level by piping the documents through
#  * lucene_passes.pl in "index" mode.
#  *
#  * @param $index  index description of the form
#  *                "fields[:subcollections[:languages]]", where the
#  *                subcollections and languages parts are comma separated
#  * @param $llevel the level tag handed to lucene_passes.pl; it is mapped
#  *                back to a key of $self->{'levels'} through
#  *                %mgppbuilder::level_map (falling back to "document")
#  *
#  * The index directory is taken from $self->{'index_mapping'}->{$index}.
#  * Unless $self->{'incremental'} is set, "-removeold" is passed to
#  * lucene_passes.pl. Dies if the script is missing or the pipe cannot be
#  * opened.
#  */
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll_name (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each entry is passed on unchanged, including any leading "!" negation
    # marker (the original stripped the "!" and immediately put it back).
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }

    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child; a false return means the write
    # side failed or lucene_passes exited with a non-zero status.
    if (!$self->{'debug'} && !close ($handle)) {
        print $outhandle "WARNING: lucene_passes (index $index) did not finish cleanly (status $?)\n";
    }

    $self->print_stats();

    # put back the full set of levels for whoever uses the buildproc next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
434
435# /** A modified version of the basebuilder.pm's function that generates the
436# * information database from the GA documents. We need to change this
437# * so that if we've been asked to do an incremental build we only add
438# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
439# * module. All other classifiers and metadata will be ignored.
440# */
441# This was added to utilize DLC's incremental updating of Hierarchy classifiers. They are heading towards just using dynamic classifiers, and we do not want to use this code either. So now, we just use basebuilder's version of make_infodatabase
# /**
#  * Incremental-aware variant of make_infodatabase.
#  *
#  * When the build is not incremental, or no info database file exists yet,
#  * the work is delegated to basebuilder::make_infodatabase with the
#  * remaining arguments. Otherwise the classifiers are initialised and the
#  * plugins are run with the buildproc in 'incinfodb' mode.
#  */
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Locate the info database file inside the build's "text" directory
    my $text_directory_path = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # Incremental addition can only occur if an information database already
    # exists. If it doesn't (or we are not building incrementally), let the
    # base class generate it.
    my $incremental_possible = $self->{'incremental'} && -e $infodb_file_path;
    unless ($incremental_possible)
    {
        # Must be invoked as a method: a direct call to
        # basebuilder::make_infodatabase(@_) would lose all the $self data.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'})
    {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings. This is still needed to process any
    #    associated files, but nothing is expected to be piped to the
    #    database, so no output handle is set up.
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    &FileUtils::makeAllDirectories ($assocdir);
    my $document_processor = $self->{'buildproc'};
    $document_processor->set_mode ('incinfodb'); # Very Important
    $document_processor->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory,
    #    using the buildproc as the document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'gli'})
    {
        print STDERR "</Stage>\n";
    }
}
485
486# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
487# * -remove and the document id on the command line.
488# *
489# * @param oid is the document identifier to be removed.
490# *
491# * @author John Rowe, DL Consulting Ltd.
492# */
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  The id is placed inside double quotes on a shell command line, so on
#  *  POSIX shells the characters that stay special inside double quotes
#  *  (backslash, double quote, dollar and backtick) are backslash-escaped
#  *  first. Nothing is run when the id is undefined or empty.
#  *
#  *  @param oid is the document identifier to be removed.
#  *  @return whatever the command wrote to standard output.
#  */
sub remove_document_from_database
{
    my ($self, $oid) = @_;

    # Nothing sensible can be removed without an identifier
    return unless (defined($oid) && length($oid));

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $gsdlos = defined($ENV{'GSDLOS'}) ? $ENV{'GSDLOS'} : "";
    my $native_windows = ($gsdlos =~ /^windows$/i) && ($^O ne "cygwin");

    my $quoted_oid = $oid;
    if (!$native_windows)
    {
        # \x24 is "$" and \x60 is "`"; hex escapes are used so that neither
        # is mistaken for a Perl variable inside the pattern.
        $quoted_oid =~ s/([\\"\x24\x60])/\\$1/g;
    }
    # NOTE(review): no escaping is attempted for cmd.exe; an id containing a
    # double quote would still break the command line there.

    # Call lucene_passes.pl with -remove and the document ID on the command line
    my $command = "$full_lucene_passes_exe -remove \"$quoted_oid\"";
    return `$command`;
}
501# /** remove_document_from_database **/
502
# Add the lucene-specific sort information to the build config, after
# mgppbuilder has added its own extras.
#
# For every entry in the buildproc's 'sortfields' list:
#   - "rank" and "none" are recorded under their own name;
#   - a field flagged in 'actualsortfields' is recorded under its short
#     name from 'sortfieldnamemap';
#   - anything else is left out.
# The short names go to $build_cfg->{'indexsortfields'} and the
# "name->shortname" pairs to $build_cfg->{'indexsortfieldmap'}.
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $self->mgppbuilder::build_cfg_extra($build_cfg);

    # need to add in sort stuff
    my @short_names = ();
    my @name_pairs = ();

    foreach my $sortfield (@{$self->{'buildproc'}->{'sortfields'}}) {
        my $shortname;
        if ($sortfield eq "rank" || $sortfield eq "none") {
            $shortname = $sortfield;
        }
        elsif ($self->{'buildproc'}->{'actualsortfields'}->{$sortfield}) {
            $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sortfield};
        }
        else {
            next;
        }

        push (@short_names, $shortname);
        push (@name_pairs, "$sortfield\-\>$shortname");
    }

    $build_cfg->{'indexsortfields'} = \@short_names;
    $build_cfg->{'indexsortfieldmap'} = \@name_pairs;
}
5271;
528
529
Note: See TracBrowser for help on using the repository browser.