source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 24460

Last change on this file since 24460 was 24460, checked in by davidb, 13 years ago

Code changes to support indexers that are provided through the extension mechanism

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
RevLine 
[8072]1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
[12844]5# from the New Zealand Digital Library Project at the
[8072]6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[12844]26###########################################################################
27# /*
[24460]28# * @version 1.0 Automated incremental building
[12844]29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
[24460]35# * @author Waikato Digital Library Research Group
[12844]36# * @author John Rowe, DL Consulting Ltd.
37# */
38###########################################################################
39
[8072]40package lucenebuilder;
41
42# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
43
[12844]44use mgppbuilder;
[17564]45use strict;
46no strict 'refs';
[24362]47use util;
[8072]48
# Set up inheritance at compile time: lucenebuilder is a subclass of
# mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
52
# /**
#  * Construct a lucenebuilder on top of a freshly built mgppbuilder object.
#  *
#  * Marks the build type as "lucene", refuses to continue when
#  * LuceneWrapper.jar is absent from $GSDLHOME/bin/java, and records both the
#  * path of lucene_passes.pl ('full_lucene_passes') and the command prefix
#  * used to launch it ('full_lucene_passes_exe').
#  *
#  * @param  every argument after the class name is forwarded unchanged to
#  *         the mgppbuilder constructor
#  * @return the object, re-blessed into the requested class
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    my $self = mgppbuilder->new(@_);
    bless $self, $class;
    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation the wrapper jar
    # will not have been installed, and there is no point going any further.
    my $wrapper_jar = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper.jar");
    die "***** ERROR: $wrapper_jar does not exist\n" unless -f $wrapper_jar;

    # Work out once, here, where lucene_passes.pl lives so that every other
    # method can simply read it back from $self.
    my $passes_script = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = $passes_script;

    # Prefix the script with a perl interpreter to ensure execution: a bare
    # "perl.exe" where executables carry the .exe suffix, otherwise the
    # interpreter reported by util::get_perl_exec() run with -S.
    my $exe_suffix = &util::get_os_exe();
    $self->{'full_lucene_passes_exe'} = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix \"$passes_script\""
        : "\"" . &util::get_perl_exec() . "\" -S \"$passes_script\"";

    return $self;
}
# /** new() **/
[8072]91
# Capability query: lucene can do incremental building, so the answer is
# always true (1).
sub is_incremental_capable {
    return 1;
}
98
# Seed the build processor with what the previous build already indexed.
#
# Reads the existing build.cfg through $self->read_build_cfg() and copies
#   - every entry of 'indexfields' into buildproc->{'indexfields'} (value 1)
#   - every "field->value" entry of 'indexfieldmap' into
#     buildproc->{'indexfieldmap'}{field} = value
# Does nothing when read_build_cfg() returns undef.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # The pattern is greedy, so "a->b->c" splits as ("a->b", "c").
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # An entry without "->" does not match and leaves $f undefined;
            # storing it would create a bogus '' => undef mapping, so skip it.
            next unless defined $f;

            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    return;
}
121
# Lucene has none of the casefold / stem / accentfold options: let the parent
# class set up its index options, then force all of them (and 'stemindexes')
# to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # One chained assignment; the statement's value is 0.
    $self->{'casefold'} = $self->{'stem'} = $self->{'accentfold'} = $self->{'stemindexes'} = 0;
}
133
# Name of the build-processor class used by default with this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'lucenebuildproc';
}
139
# this writes a nice version of the text docs
#
# Unless 'no_text' is set, the build processor is run in 'text' mode over the
# source directory (through the plugin pipeline) and its output is piped into
#     lucene_passes.pl text Doc "<build_dir>" "dummy"
# In debug mode the processor writes to STDOUT instead and lucene_passes.pl is
# not started.
#
# @param textindex  handed straight to the build processor's set_index()
#
# Dies when lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the text directory exists
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # so lucene_passes doesn't print to stderr if we redirect output
    # (not done on windows)
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the failure against this method, not build_index
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT (debug mode) is left open; only the pipe is closed
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
[9179]223
# Build every wanted index, once for each configured level.
#
# @param indexname  optional; when it contains a word character only that
#                   index is built, otherwise the list is taken from
#                   collect_cfg->{'indexes'}
#
# Paragraph levels are not built (a warning is printed when one is
# configured). While a level is being built, the index's entry in
# 'index_mapping' temporarily holds <first letter of level><directory>; the
# plain directory name is put back once all levels are done.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # Have we got para index?
    if (grep { $_ =~ /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene/Solr\n";
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $base_dir = $self->{'index_mapping'}->{$index};

        # we don't do para indexing
        foreach my $level (grep { $_ !~ /paragraph/ } keys %{$self->{'levels'}}) {
            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            my $subdir = $level_initial.$base_dir;
            $self->{'index_mapping'}->{$index} = $subdir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory $subdir\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # restore the level-independent directory name
        $self->{'index_mapping'}->{$index} = $base_dir;
    }

    $self->post_build_indexes();
}
280
[12844]281
# Build one index at one level.
#
# The build processor is run over the source directory (through
# plugin::read) with its output piped into
#     lucene_passes.pl [-removeold] index <llevel> "<build_dir>" "<indexdir>"
# In debug mode the processor writes to STDOUT and lucene_passes.pl is not
# started.
#
# @param index   index specification "fields[:subcoll,...[:lang,...]]"; also
#                the key into 'index_mapping' giving the index directory
# @param llevel  level tag as found in %mgppbuilder::level_map
#
# Dies when lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # the level tag doubles as the section name handed to lucene_passes
    my $lucene_passes_sections = $llevel;

    # only a non-incremental build passes -removeold
    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # there may be subcollection info, and language info.
    # (the leading "fields" component is not needed here)
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # names with no definition in the collection configuration are ignored
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
            push (@$indexexparr, $expression) if defined $expression;
        }
    }

    # language codes for a language subcollection are passed on exactly as
    # written, including any leading "!" (negation)
    my $langarr = (defined $language) ? [ split /,/, $language ] : [];
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # map the level tag back to the configured level name
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # STDOUT (debug mode) is left open; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put the full set of levels back now that this single level is done
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
[8072]403
# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database from the GA documents. When asked to do an
#  *  incremental build, metadata is only added to autohierarchy classifiers
#  *  via the IncrementalBuildUtils module; all other classifiers and
#  *  metadata are ignored.
#  *
#  *  Any arguments are passed on to basebuilder::make_infodatabase when the
#  *  full (non-incremental) path is taken.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and
# we do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Get info database file path
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_dir);

    # Incremental addition can only occur if an information database already
    # exists. Otherwise hand over to the superclass's version, called as a
    # method so that $self is preserved.
    unless ($self->{'incremental'} && -e $infodb_path)
    {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assoc_dir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assoc_dir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug, with the build processor as document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
454
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  Nothing is run, and a message is written to STDERR, when the id is
#  *  undefined or empty, or contains a character that could break out of the
#  *  double-quoted shell argument it is placed in.
#  *
#  *  @param  oid is the document identifier to be removed.
#  *  @return whatever the command wrote to its standard output; nothing when
#  *          the id was rejected.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my ($self, $oid) = @_;

    if (!defined $oid || $oid eq "")
    {
        print STDERR "lucenebuilder::remove_document_from_database - no document id given, nothing removed\n";
        return;
    }

    # The id is spliced into a shell command inside double quotes, so a
    # quote, backtick, dollar sign, backslash or line break in it could end
    # the argument early or trigger shell expansion. Refuse such ids rather
    # than run an unintended command.
    if ($oid =~ /["`\$\\\r\n]/)
    {
        print STDERR "lucenebuilder::remove_document_from_database - refusing unsafe document id, nothing removed\n";
        return;
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
    # Call lucene_passes.pl with -remove and the document ID on the command line
    return `$full_lucene_passes_exe -remove "$oid"`;
}
# /** remove_document_from_database **/
471
472
[8072]4731;
474
475
Note: See TracBrowser for help on using the repository browser.