source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 27357

Last change on this file since 27357 was 27357, checked in by kjdon, 11 years ago

setting sortfield info to buildproc, and getting it back from buildproc for the build.cfg file.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.2 KB
RevLine 
[8072]1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
[12844]5# from the New Zealand Digital Library Project at the
[8072]6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[12844]26###########################################################################
27# /*
[24496]28# * @version 1.0 Initial implementation of incremental building
[12844]29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
[24496]35# * @author David Bainbridge and Katherine Don, Waikato DL Research group
[12844]36# * @author John Rowe, DL Consulting Ltd.
[24496]37# * @author John Thompson, DL Consulting Ltd.
[12844]38# */
39###########################################################################
40
[8072]41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
[12844]45use mgppbuilder;
[17564]46use strict;
47no strict 'refs';
[24362]48use util;
[27329]49use FileUtils;
[8072]50
# Set up inheritance at compile time: lucenebuilder is a subclass of
# mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
54
# /** Constructor. Builds an mgppbuilder object, re-blesses it into this
#  *  class and records how to invoke lucene_passes.pl.
#  *
#  *  Fields set on the object:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path of the lucene_passes.pl script
#  *    full_lucene_passes_exe - command line prefix used to run that script
#  *
#  *  Dies if LuceneWrapper3.jar is not found under $GSDLHOME/bin/java.
#  *
#  *  @param  all arguments after the class name are handed to mgppbuilder->new
#  *  @return the new lucenebuilder object
#  *
#  *  @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Direct method call rather than the indirect-object form
    # ("new mgppbuilder (@_)"), whose parse depends on which subs and
    # packages happen to be known at compile time.
    my $self = mgppbuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation, then we won't be
    # able to continue. Check for existence of LuceneWrapper to see if Lucene
    # was disabled.
    my $lucene = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","LuceneWrapper3.jar");
    if (! -f $lucene) {
        die "***** ERROR: $lucene does not exist\n";
    }

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe ();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &FileUtils::filenameConcatenate($scriptdir, "lucene_passes.pl");

    # So tack perl on the beginning to ensure execution
    $self->{'full_lucene_passes'} = "$lucene_passes_script";
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$lucene_passes_script\"";
    }

    return $self;
}
# /** new() **/
[8072]93
# Reports whether this builder supports incremental building.
# Always returns 1: lucene can do incremental building.
sub is_incremental_capable {
    my $incremental_capable = 1;
    return $incremental_capable;
}
100
# Prepares the buildproc for an incremental build by loading, from the
# existing build.cfg, what has already been indexed:
#   - every entry of 'indexfields' is flagged in buildproc->{extraindexfields}
#   - every "name->shortname" entry of 'indexfieldmap' is recorded in
#     buildproc->{fieldnamemap} (both directions) and buildproc->{allindexfields}
# Does nothing when read_build_cfg() returns undef.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            # extraindexfields is only supposed to have extra ones in it, not
            # those already specified in indexes. And this list has all
            # indexes in it. But we do a check before including things from
            # extraindexfields whether it was specified in indexes, so it is
            # all ok.
            $self->{'buildproc'}->{'extraindexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # An entry without "->" does not match, leaving $f and $v undef;
            # skip it rather than recording bogus '' keys in the maps.
            next unless defined $f && defined $v;

            $self->{'buildproc'}->{'fieldnamemap'}->{$f} = $v;
            $self->{'buildproc'}->{'fieldnamemap'}->{$v} = 1;
            $self->{'buildproc'}->{'allindexfields'}->{$f} = 1;
        }
    }

    return;
}
126
# Runs the parent class's generate_index_options() and then switches off
# casefold, stem, accentfold and stemindexes: lucene has none of these options.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$option} = 0;
    }
    return 0;
}
138
# Returns the name of the build processor to use with this builder.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
144
# this writes a nice version of the text docs
#
# Runs the plugins over the source directory with the buildproc in 'text'
# mode (not indexing text), sending the buildproc output down a pipe to
# "lucene_passes.pl text Doc <build_dir> dummy". In debug mode the output
# goes to STDOUT instead and lucene_passes is not run.
#
# Does nothing if $self->{'no_text'} is set.
#
# @param textindex  the index description handed to buildproc->set_index
#
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): $text_dir is not used after this point, so this
        # substitution has no effect on the command line below (which uses
        # $build_dir). build_index applies the same substitution to
        # $build_dir - confirm which one was intended here.
        $text_dir =~ s@/@\\@g;
    }
    else {
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # Build the command once so the logged line and the executed line cannot
    # drift apart.
    my $lucene_passes_cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $lucene_passes_cmd\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $lucene_passes_cmd"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # (message used to name build_index, which made failures here
            # look as if they came from the indexing pass)
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child; a false result means the write
        # failed or lucene_passes exited with a non-zero status. Report it
        # but carry on, as before.
        if (!close ($handle)) {
            print STDERR "Warning: lucene_passes (text) did not finish cleanly (exit status $?)\n";
        }
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
[9179]227
# Builds the requested index (or, when no index name is given, every index
# listed in collect_cfg->{indexes}) at each desired level.
#
# @param indexname   optional: build only this index
# @param indexlevel  optional: build only this level (a key of
#                    $self->{'levels'}); when it matches none of the
#                    supported levels, all supported levels are built
#
# For each index/level pair the index_mapping entry is temporarily set to
# <first letter of level><mapped directory name> while build_index runs, and
# restored afterwards. Paragraph levels are skipped with a warning.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Determine what levels of index we want to build (a user may specify a
    # single level to index by using the indexlevel parameter) [jmt12]
    my @supported_levels;
    # sorted so the build order does not depend on hash ordering
    foreach my $level (sort keys %{$self->{'levels'}}) {
        # ignore paragraph levels as they are unsupported in Lucene
        if ($level =~ /paragraph/) {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
            next;
        }
        push (@supported_levels, $level);
    }

    # Build only the requested level if one was specified. The previous
    # single-pass filter also kept every level that happened to be visited
    # before the requested one. If the requested level matches nothing we
    # fall back to all supported levels, as before.
    my @desired_indexlevels = @supported_levels;
    if (defined $indexlevel) {
        my @requested_levels = grep { $_ eq $indexlevel } @supported_levels;
        @desired_indexlevels = @requested_levels if @requested_levels;
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            # we now iterate through the filtered list of index levels [jmt12]
            foreach my $level (@desired_indexlevels) {
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            # put back the level-independent directory name
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();
}
300
[12844]301
# Builds one index at one level by piping the buildproc output into
# "lucene_passes.pl [-removeold] index <level> <build_dir> <indexdir>".
# -removeold is passed unless $self->{'incremental'} is set. In debug mode
# the output goes to STDOUT instead and lucene_passes is not run.
#
# @param index   index description "fields[:subcollections[:languages]]",
#                where subcollections and languages are comma separated
# @param llevel  level tag as found in %mgppbuilder::level_map values
#
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = (defined $subcollection) ? split (/,/, $subcollection) : ();
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each entry is passed through as written, including any leading "!".
    # (The old loop stripped the "!" and immediately put it back.)
    my $langarr = (defined $language) ? [ split (/,/, $language) ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # Build the command once so the logged line and the executed line cannot
    # drift apart.
    my $lucene_passes_cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $lucene_passes_cmd\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $lucene_passes_cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # map the level tag back to its level name
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});

    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child; a false result means the write
        # failed or lucene_passes exited with a non-zero status. Report it
        # but carry on, as before.
        if (!close ($handle)) {
            print STDERR "Warning: lucene_passes (index $index, level $llevel) did not finish cleanly (exit status $?)\n";
        }
    }

    $self->print_stats();

    # restore the full set of levels on the buildproc
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
[8072]425
# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database from the GA documents. For an incremental build
#  *  it only adds metadata to autohierarchy classifiers via the
#  *  IncrementalBuildUtils module; all other classifiers and metadata are
#  *  ignored.
#  *
#  *  This was added to utilize DLC's incremental updating of Hierarchy
#  *  classifiers. They are heading towards just using dynamic classifiers,
#  *  and we do not want to use this code either. So now, we just use
#  *  basebuilder's version of make_infodatabase.
#  */
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Where the info database lives
    my $text_directory_path = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # An incremental addition is only possible when one was asked for and an
    # information database already exists. In every other case hand over to
    # basebuilder's version, which generates the database.
    # Note: this is called as a method; calling
    # basebuilder::make_infodatabase(@_) directly would lose all the $self
    # data.
    unless ($self->{'incremental'} && -e $infodb_file_path) {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assocdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    &FileUtils::makeAllDirectories ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
476
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  @param  oid is the document identifier to be removed.
#  *  @return whatever the command wrote to its standard output
#  *
#  *  NOTE(review): the oid is interpolated into a shell command inside
#  *  double quotes; an identifier containing a double quote or other shell
#  *  metacharacters would break the command - confirm OIDs cannot contain
#  *  them.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift (@_);
    my ($oid) = @_;

    # The command prefix that runs lucene_passes.pl
    my $lucene_passes_cmd = $self->{'full_lucene_passes_exe'};

    # Run it with -remove and the document ID
    return qx{$lucene_passes_cmd -remove "$oid"};
}
# /** remove_document_from_database **/
493
# Adds the lucene-specific entries to the build.cfg data, after letting
# mgppbuilder add its own.
#
# @param build_cfg  hash ref of build.cfg data; on return it additionally has
#                   indexsortfields   - "rank" plus the short names of the sort
#                                       fields flagged in the buildproc's
#                                       actualsortfields
#                   indexsortfieldmap - "name->shortname" for each of those
#                                       fields (not for "rank")
sub build_cfg_extra {
    my ($self, $build_cfg) = @_;

    $self->mgppbuilder::build_cfg_extra($build_cfg);

    # need to add in sort stuff
    my (@short_names, @name_mappings);

    for my $sortfield (@{$self->{'buildproc'}->{'sortfields'}}) {
        # "rank" is passed through under its own name, with no mapping entry
        if ($sortfield eq "rank") {
            push(@short_names, $sortfield);
            next;
        }

        # only report fields the buildproc has flagged as actual sort fields
        next unless $self->{'buildproc'}->{'actualsortfields'}->{$sortfield};

        my $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sortfield};
        push(@short_names, $shortname);
        push(@name_mappings, "$sortfield\-\>$shortname");
    }

    $build_cfg->{'indexsortfields'} = \@short_names;
    $build_cfg->{'indexsortfieldmap'} = \@name_mappings;
}
[8072]5171;
518
519
Note: See TracBrowser for help on using the repository browser.