source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 24754

Last change on this file since 24754 was 24496, checked in by davidb, 13 years ago

Tidy up of comments

  • Property svn:keywords set to Author Date Id Revision
File size: 16.4 KB
RevLine 
[8072]1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
[12844]5# from the New Zealand Digital Library Project at the
[8072]6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
###########################################################################
# /*
#  * @version 1.0 Initial implementation of incremental building
#  * @version 2.0 Incremental building assistance added, including
#  *   remove_document_from_database, which implements the grandparent
#  *   class's empty function by calling lucene_passes.pl, and the
#  *   full_lucene_passes / full_lucene_passes_exe settings, so that there
#  *   is one place in the code that works out where the perl script is.
#  *   John Rowe
#  *
#  * @author David Bainbridge and Katherine Don, Waikato DL Research group
#  * @author John Rowe, DL Consulting Ltd.
#  * @author John Thompson, DL Consulting Ltd.
#  */
###########################################################################
40
[8072]41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
[12844]45use mgppbuilder;
[17564]46use strict;
47no strict 'refs';
[24362]48use util;
[8072]49
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = ('mgppbuilder');
}
53
# /**
#  * Constructor. Builds an mgppbuilder with the supplied arguments, re-blesses
#  * it into the requested class and records where lucene_passes.pl lives:
#  *   full_lucene_passes     - path of the script itself
#  *   full_lucene_passes_exe - command line used to execute the script
#  * Dies if LuceneWrapper.jar is not present under $GSDLHOME/bin/java.
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    my $self = bless mgppbuilder->new(@_), $class;
    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation there is no
    # LuceneWrapper.jar, and nothing in this builder can work without it.
    my $lucene_jar = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper.jar");
    die "***** ERROR: $lucene_jar does not exist\n" unless -f $lucene_jar;

    # ".exe" here selects the plain "perl.exe <script>" form of the command.
    my $os_suffix = &util::get_os_exe ();

    # Work out, once, where lucene_passes.pl is.
    my $passes_script = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Prefix the script with a perl interpreter to ensure it executes.
    $self->{'full_lucene_passes_exe'} = ($os_suffix eq ".exe")
        ? "perl$os_suffix \"$passes_script\""
        : "\"" . &util::get_perl_exec() . "\" -S \"$passes_script\"";

    return $self;
}
# /** new() **/
[8072]92
# Capability query: returns 1 because lucene can do incremental building.
sub is_incremental_capable
{
    my $incremental_supported = 1;
    return $incremental_supported;
}
99
# Prepares for an incremental build by reading 'indexfields' and
# 'indexfieldmap' from the existing build.cfg into the build processor, so
# that we know what has already been indexed.
#
# Returns nothing. Does nothing when read_build_cfg() yields no configuration.
sub init_for_incremental_build {
    my $self = shift (@_);

    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # Each entry has the form "name->value". Entries that do not match
            # are skipped; they used to be stored under an empty-string key.
            next unless defined $mapping;
            my ($name, $value) = $mapping =~ /^(.*)\-\>(.*)$/;
            next unless defined $name;
            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $value;
        }
    }

    return;
}
122
# lucene has none of these options: after the parent class has generated its
# index options, casefold, stem, accentfold and stemindexes are forced to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$unsupported} = 0;
    }

    # Same value the final assignment used to yield as the implicit result.
    return 0;
}
134
# Returns the name of the build processor used by default with this builder.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
140
# /** Writes a nice version of the text docs by running every document through
#  *  the build processor in 'text' mode and piping the result into
#  *  lucene_passes.pl ("text Doc <build_dir> dummy").
#  *
#  *  Does nothing when $self->{'no_text'} is set. When $self->{'debug'} is
#  *  set the output goes to STDOUT instead of lucene_passes.pl.
#  *
#  *  @param textindex the index specification handed to the build processor's
#  *         set_index()
#  *
#  *  Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
#  */
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the text directory exists
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # Away from Windows, keep lucene_passes quiet on stderr when our own
    # output has been redirected somewhere other than STDERR.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        $osextra .= " 2>/dev/null";
    }

    # The perl script to call to run lucene (both worked out in new())
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        # Build the command once so the logged and executed forms cannot drift.
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "|-", $cmd))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode; otherwise closing the pipe lets
    # lucene_passes finish.
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
[9179]224
# Builds every wanted index at every non-paragraph level.
#
# @param indexname optional; when it contains a word character only that
#        index is built, otherwise all of collect_cfg's 'indexes' are used.
#
# For each index/level pair the index_mapping entry is temporarily prefixed
# with the first letter of the level name while build_index() runs, and is
# put back to the plain directory name afterwards.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $out = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # Have we got para index?  Warn once if so.
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print $out "Warning: Paragraph level indexing not supported by Lucene/Solr\n";
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        unless ($self->want_built($index)) {
            print $out "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $plain_dir = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            # we don't do para indexing
            next if $level =~ /paragraph/;

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_letter) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $level_letter.$plain_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $out "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # restore the un-prefixed directory name
        $self->{'index_mapping'}->{$index} = $plain_dir;
    }

    return $self->post_build_indexes();
}
281
[12844]282
# Builds one index at one level by piping the build processor's output into
# lucene_passes.pl ("[-removeold] index <level> <build_dir> <indexdir>").
#
# @param index   index specification of the form "fields[:subcolls[:langs]]",
#                where subcolls and langs are comma separated lists
# @param llevel  the level tag passed to lucene_passes.pl; it is mapped back to
#                a level name through %mgppbuilder::level_map
#
# When $self->{'debug'} is set the output goes to STDOUT instead of
# lucene_passes.pl. Dies if lucene_passes.pl does not exist or the pipe
# cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # The perl script to call to run lucene (both worked out in new())
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names for lucenepasses
    my $lucene_passes_sections = $llevel;

    # a non-incremental build starts by removing the old index
    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # subcollections that collect.cfg does not define are ignored
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
            push (@$indexexparr, $expression) if defined $expression;
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Entries are passed on verbatim, including
    # any leading "!" negation marker.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = (defined $language) ? [ split /,/, $language ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        # Build the command once so the logged and executed forms cannot drift.
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "|-", $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # find the level name whose tag is $llevel
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # hand the full set of levels back to the build processor
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
[8072]404
# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database from the GA documents. When asked to do an
#  *  incremental build, metadata is only meant to be added to autohierarchy
#  *  classifiers via the IncrementalBuildUtils module; all other classifiers
#  *  and metadata are ignored.
#  *
#  *  Falls back to basebuilder's make_infodatabase (passing the remaining
#  *  arguments through) unless the build is incremental AND the information
#  *  database file already exists.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and we
# do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $out = $self->{'outhandle'};

    # Where the info database file lives
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_dir);

    # Incremental addition can only occur if an information database already
    # exists. If it doesn't, let the super class's function generate it.
    my $can_add_incrementally = $self->{'incremental'} && -e $infodb_path;
    unless ($can_add_incrementally)
    {
        # Note: a plain basebuilder::make_infodatabase(@_) call doesn't work
        # here, as the direct reference means all the $self data is lost.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $out "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assoc_dir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assoc_dir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
455
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  @param oid is the document identifier to be removed.
#  *  @return whatever the command printed on its standard output
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift(@_);
    my $oid = shift(@_);

    # The command line that runs lucene_passes.pl (worked out in new())
    my $lucene_passes_cmd = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is interpolated into a shell command inside double
    # quotes, and the command's exit status is not inspected - confirm that
    # identifiers can never contain shell metacharacters.
    return qx{$lucene_passes_cmd -remove "$oid"};
}
# /** remove_document_from_database **/
472
473
[8072]4741;
475
476
Note: See TracBrowser for help on using the repository browser.