source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 24192

Last change on this file since 24192 was 24192, checked in by ak19, 13 years ago

Sam discovered that using dollar-Config{perlpath} in place of dollar-hat-X is the better way to obtain the path to the perl that is being used. We hope this will not be a relative path on the Mac as dollar-hat-x was on Professor Witten's Mac when we tried it there today.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
RevLine 
[8072]1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
[12844]5# from the New Zealand Digital Library Project at the
[8072]6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[12844]26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
[8072]39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
[12844]43use mgppbuilder;
[17564]44use strict;
45no strict 'refs';
[24192]46use Config; # for getting the perlpath in the recommended way
[8072]47
# Set up inheritance at compile time: lucenebuilder is a subclass of
# mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
51
[12844]52# /**
53# * @author John Thompson, DL Consulting Ltd.
54# */
# Constructor. Builds an mgppbuilder object from the supplied arguments,
# re-blesses it into the requested class and records how lucene_passes.pl
# should be invoked.
#
# Sets on the object:
#   buildtype              - always "lucene"
#   full_lucene_passes     - path of the lucene_passes.pl script
#   full_lucene_passes_exe - command-line prefix used to run that script
#
# Dies if LuceneWrapper.jar is not found under $GSDLHOME/bin/java.
sub new {
    my $class = shift(@_);
    my $self  = bless(mgppbuilder->new(@_), $class);

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation the wrapper jar
    # will be missing, and there is no point in continuing.
    my $wrapper_jar = util::filename_cat($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper.jar");
    die "***** ERROR: $wrapper_jar does not exist\n" unless -f $wrapper_jar;

    # ".exe" tells us the platform runs perl as perl.exe.
    my $exe_suffix = util::get_os_exe();

    # Locate lucene_passes.pl in the Greenstone script directory.
    my $passes_script = util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Prefix the script with a perl interpreter to ensure it is executed:
    # plain "perl.exe" when the OS suffix is ".exe", otherwise the perl
    # recorded in %Config.
    $self->{'full_lucene_passes_exe'} = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix \"$passes_script\""
        : "\"$Config{perlpath}\" -S \"$passes_script\"";

    return $self;
}
[12844]89# /** new() **/
[8072]90
# Lucene can do incremental building, so this always returns true (1).
sub is_incremental_capable {
    return 1;
}
97
# Prime the buildproc with what an earlier build already indexed.
#
# Reads the existing build.cfg (via read_build_cfg) and copies:
#   indexfields   - each listed field is flagged (value 1) in
#                   $self->{'buildproc'}->{'indexfields'}
#   indexfieldmap - each "name->value" entry is stored as name => value in
#                   $self->{'buildproc'}->{'indexfieldmap'}
#
# Returns nothing. Does nothing when no build configuration is available.
sub init_for_incremental_build {
    my $self = shift(@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # An entry without "->" would otherwise be recorded under an
            # undefined (empty-string) key; skip it instead.
            next unless defined $f;

            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    return;
}
121
# Runs the parent class's option generation, then switches off casefold,
# stem, accentfold and stemindexes: lucene has none of these options.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $option ('casefold', 'stem', 'accentfold') {
        $self->{$option} = 0;
    }
    return ($self->{'stemindexes'} = 0);
}
133
# Name of the document processor used by this builder.
sub default_buildproc {
    return "lucenebuildproc";
}
139
# Writes a nice version of the text docs by streaming every document through
# lucene_passes.pl in "text" mode.
#
# Parameters:
#   $textindex - index specification handed to the buildproc's set_index().
#
# Does nothing when $self->{'no_text'} is set. In debug mode the buildproc
# output goes to STDOUT instead of a lucene_passes.pl pipe.
#
# Dies if the lucene_passes.pl script does not exist or the pipe to it cannot
# be opened.
sub compress_text
{
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the text directory exists; the command itself is given the
    # build directory
    my $text_dir = util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = util::filename_cat($self->{'build_dir'}, "");
    util::mk_all_dir($text_dir);

    # so lucene_passes doesn't print to stderr if we redirect output
    # (not applied on windows)
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR") {
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The command is a shell string (quoted arguments plus an optional
        # redirection), so it is opened as a shell pipe.
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($textindex);
    $buildproc->set_indexing_text(0);
    $buildproc->set_levels($levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                  $self->{'buildproc'}, $self->{'maxdocs'});
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode; otherwise finish the pipe
    close($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
[9179]223
# Builds every wanted index at every configured level except paragraph.
#
# Parameters:
#   $indexname - optional; when it contains a word character only that index
#                is built, otherwise all of $self->{'collect_cfg'}->{'indexes'}.
#
# For each index/level pair the index_mapping entry is temporarily set to the
# first letter of the level followed by the mapped directory name while
# build_index runs, and restored afterwards. Finishes by calling
# make_final_field_list.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # have we got para index?
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene\n";
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $mapped_dir = $self->{'index_mapping'}->{$index};
        foreach my $level (keys %{$self->{'levels'}}) {
            # we don't do para indexing
            next if $level =~ /paragraph/;

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $level_initial.$mapped_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index, $llevel);
        }

        # put the level-independent directory name back
        $self->{'index_mapping'}->{$index} = $mapped_dir;
    }

    # define the final field lists
    return $self->make_final_field_list();
}
276
[12844]277
# Builds one index at one level by piping the buildproc output into
# lucene_passes.pl in "index" mode.
#
# Parameters:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             with subcollections and languages comma separated.
#   $llevel - level tag (a value of %mgppbuilder::level_map).
#
# Passes -removeold to lucene_passes.pl unless this is an incremental build.
# In debug mode output goes to STDOUT instead of the pipe.
#
# Dies if the lucene_passes.pl script does not exist or the pipe to it cannot
# be opened.
sub build_index {
    my ($self, $index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    util::mk_all_dir(util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = "-removeold";
    $opt_create_index = "" if $self->{'incremental'};

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split(":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = defined $subcollection ? split(/,/, $subcollection) : ();
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push(@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = defined $self->{'collect_cfg'}->{'languagemetadata'}
        ? $self->{'collect_cfg'}->{'languagemetadata'}
        : "Language";

    my $langarr = [];
    my @languages = defined $language ? split(/,/, $language) : ();
    foreach my $lang (@languages) {
        # a leading "!" marks a negated language and is kept on the entry
        my $negated = ($lang =~ s/^\!//) ? 1 : 0;
        push(@$langarr, $negated ? "!$lang" : "$lang");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; # always

    # find the configured level whose tag is $llevel
    my $dom_level = "";
    foreach my $candidate (keys %$store_levels) {
        $dom_level = $candidate if ($mgppbuilder::level_map{$candidate} eq $llevel);
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($index, $indexexparr);
    $buildproc->set_index_languages($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text(1);
    $buildproc->set_levels($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # restore the full set of levels on the buildproc
    $buildproc->set_levels($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
[8072]399
[12844]400# /** A modified version of the basebuilder.pm's function that generates the
[15714]401# * information database from the GA documents. We need to change this
[12844]402# * so that if we've been asked to do an incremental build we only add
403# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
404# * module. All other classifiers and metadata will be ignored.
405# */
[17286]406# This was added to utilize DLC's incremental updating of Hierarchy classifiers. They are heading towards just using dynamic classifiers, and we do not want to use this code either. So now, we just use basebuilder's version of make_infodatabase
sub make_infodatabase_dlc
{
    my $self = shift(@_);
    my $outhandle = $self->{'outhandle'};

    # Get info database file path
    my $text_directory_path = util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # An incremental addition is only possible when one was requested AND an
    # information database already exists. Otherwise hand over to
    # basebuilder's version, called as a method so that $self is kept, with
    # the remaining arguments.
    my $incremental_possible = $self->{'incremental'} && -e $infodb_file_path;
    unless ($incremental_possible) {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    classify::init_classifiers($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assocdir = util::filename_cat($self->{'build_dir'}, "assoc");
    util::mk_all_dir($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode('incinfodb'); # Very Important
    $buildproc->set_assocdir($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the buildproc as the document processor.
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                 $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
450
[13590]451# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
452# * -remove and the document id on the command line.
453# *
454# * @param oid is the document identifier to be removed.
455# *
456# * @author John Rowe, DL Consulting Ltd.
457# */
sub remove_document_from_database
{
    my $self = shift(@_);
    my ($oid) = @_;

    # Command prefix that runs lucene_passes.pl, followed by -remove and the
    # quoted document ID.
    my $remove_cmd = "$self->{'full_lucene_passes_exe'} -remove \"$oid\"";

    # Run it through the shell; the command's output is the return value.
    return qx{$remove_cmd};
}
466# /** remove_document_from_database **/
467
468
[8072]4691;
470
471
Note: See TracBrowser for help on using the repository browser.