source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 27301

Last change on this file since 27301 was 27301, checked in by jmt12, 11 years ago

You can now use the indexname and indexlevel options to buildcol to selectively build lucene indexes

  • Property svn:keywords set to Author Date Id Revision
File size: 17.0 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 Initial implementation of incremental building
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database, which overrides the grandparent class's
31# * empty function to call lucene_passes.pl, and full_lucene_passes_exe,
32# * so that there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author David Bainbridge and Katherine Don, Waikato DL Research group
36# * @author John Rowe, DL Consulting Ltd.
37# * @author John Thompson, DL Consulting Ltd.
38# */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49
# Establish the inheritance chain at compile time: lucenebuilder is a
# subclass of mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
53
54# /**
55# * @author John Thompson, DL Consulting Ltd.
56# */
# Constructor. Builds the object with mgppbuilder's constructor, re-blesses
# it into the requested class, marks it as a "lucene" builder and records
# how lucene_passes.pl has to be invoked.
#
# Arguments: passed through unchanged to mgppbuilder's constructor.
# Returns:   the blessed builder object, with 'buildtype',
#            'full_lucene_passes' (path of the script) and
#            'full_lucene_passes_exe' (command prefix used to run it) set.
# Dies:      if LuceneWrapper.jar is not found under $GSDLHOME/bin/java.
sub new {
    my $class = shift(@_);

    # Explicit method call instead of the indirect-object form
    # "new mgppbuilder (@_)", whose parsing depends on what else is in scope.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation, then we won't be
    # able to continue. Check for existence of LuceneWrapper to see if Lucene
    # was disabled.
    my $lucene = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrapper.jar");
    if (! -f $lucene) {
        die "***** ERROR: $lucene does not exist\n";
    }

    # Executable suffix for this platform (compared against ".exe" below;
    # presumably that is what util::get_os_exe returns on Windows).
    my $exe = &util::get_os_exe ();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # Location of lucene_passes.pl
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # Keep both the bare script path (used for existence checks) and a full
    # command prefix that runs the script through perl.
    $self->{'full_lucene_passes'} = "$lucene_passes_script";
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$lucene_passes_script\"";
    }

    return $self;
}
91# /** new() **/
92
# Capability flag queried by the build framework: the Lucene builder
# supports incremental building, so this always answers true (1).
sub is_incremental_capable {
    my $supports_incremental = 1;
    return $supports_incremental;
}
99
# Primes the buildproc with the index fields recorded by a previous build,
# so that an incremental build knows what has already been indexed.
#
# Reads 'indexfields' and 'indexfieldmap' from the existing build.cfg (via
# $self->read_build_cfg()) and copies them into
# $self->{'buildproc'}->{'indexfields'} / ->{'indexfieldmap'}.
# Returns early, changing nothing, when no build configuration is available.
sub init_for_incremental_build {
    my $self = shift (@_);

    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # Each entry has the form "name->shortname".
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # An entry without "->" does not match and leaves $f/$v undefined;
            # skip it rather than storing bogus '' keys in the map.
            next unless defined $f && defined $v;

            # Record both directions: name => shortname, and shortname as used.
            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
            $self->{'buildproc'}->{'indexfieldmap'}->{$v} = 1;
        }
    }
}
123
124# lucene has none of these options
# Lets the parent class generate its index options, then switches off the
# four options this builder does not use: casefold, stem, accentfold and
# stemindexes are all forced to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported_option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$unsupported_option} = 0;
    }

    # Same value the final assignment of the original evaluated to.
    return 0;
}
135
# Name of the document-processor class this builder uses by default.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
141
142# this writes a nice version of the text docs
# Writes the stored text of the collection by streaming every document
# through "lucene_passes.pl text".
#
# Arguments:
#   $textindex - index specification handed to the buildproc's set_index().
#
# Does nothing when $self->{'no_text'} is set. In debug mode the buildproc
# output goes to STDOUT instead of being piped into lucene_passes.pl.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # make sure the text directory exists
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # NOTE(review): the original rewrote '/' to '\' in $text_dir on Windows,
    # but $text_dir is never used after this point, so that rewrite had no
    # effect and has been dropped. build_index applies the same rewrite to
    # $build_dir; confirm whether that was the intent here as well.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my $handle;

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        # Three-argument pipe open: the mode is never taken from the command
        # string. The command still runs through the shell, as before.
        if (!-e "$full_lucene_passes" || !open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # The original message wrongly blamed build_index.
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    # run every document in the source directory through the plugins
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT is left open in debug mode
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
225
# Builds the Lucene indexes of the collection.
#
# Arguments (both optional):
#   $indexname  - build only this index; when undefined or without any word
#                 character, every index listed in collect_cfg is built.
#   $indexlevel - build only this level (a key of $self->{'levels'}); when
#                 undefined or without any word character, every supported
#                 level is built.
#
# Paragraph levels are never built. If the requested level is not one of the
# supported levels, a warning is printed and all supported levels are built
# (this is what the original code did for an unmatched request).
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Collect the levels Lucene can build; paragraph levels are unsupported.
    my @supported_levels;
    foreach my $level (keys %{$self->{'levels'}}) {
        if ($level =~ /paragraph/) {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
            next;
        }
        push (@supported_levels, $level);
    }

    # Determine what levels of index we want to build (a user may request a
    # specific level by using the indexlevel parameter) [jmt12].
    # The original filtered inside the loop above and stopped at the first
    # match, so any level visited before the match was built as well and the
    # outcome depended on hash ordering. Filter explicitly instead.
    my @desired_indexlevels = @supported_levels;
    if (defined $indexlevel && $indexlevel =~ /\w/) {
        my @requested = grep { $_ eq $indexlevel } @supported_levels;
        if (@requested) {
            @desired_indexlevels = @requested;
        } else {
            print $outhandle "WARNING: Requested index level '$indexlevel' is not available. Building all supported levels\n";
        }
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $idx = $self->{'index_mapping'}->{$index};

        # iterate through the filtered list of index levels [jmt12]
        foreach my $level (@desired_indexlevels) {
            # The directory name is the first letter of the level followed
            # by the base directory name. Should probably check that the new
            # name is unique ... but currently (with doc sec and para) each
            # level has a unique first letter.
            my ($pindex) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $pindex.$idx;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # restore the level-independent directory name
        $self->{'index_mapping'}->{$index} = $idx;
    }

    $self->post_build_indexes();
}
298
299
# Builds one index at one level by streaming the documents through
# "lucene_passes.pl index".
#
# Arguments:
#   $index  - index specification of the form "fields[:subcolls[:languages]]",
#             where subcolls and languages are comma-separated lists.
#   $llevel - level name as passed to lucene_passes.pl (a value of
#             %mgppbuilder::level_map).
#
# The index is written below $self->{'build_dir'} in the directory named by
# $self->{'index_mapping'}->{$index}. Unless $self->{'incremental'} is set,
# lucene_passes.pl is given -removeold. In debug mode the buildproc output
# goes to STDOUT instead of the pipe.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info; the leading
    # fields part of the specification is not needed here.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Entries are passed on verbatim, including any leading "!". The original
    # stripped the "!" and immediately put it back, which was a no-op.
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";

        # Three-argument pipe open: the mode is never taken from the command
        # string. The command still runs through the shell, as before.
        if (!-e "$full_lucene_passes" || !open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # Translate the lucene_passes level name back to the key used in
    # $self->{'levels'}; fall back to "document" if nothing matches.
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # STDOUT is left open in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put the full set of levels back for whoever uses the buildproc next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
421
422# /** A modified version of the basebuilder.pm's function that generates the
423# * information database from the GA documents. We need to change this
424# * so that if we've been asked to do an incremental build we only add
425# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
426# * module. All other classifiers and metadata will be ignored.
427# */
428# This was added to utilize DLC's incremental updating of Hierarchy classifiers. They are heading towards just using dynamic classifiers, and we do not want to use this code either. So now, we just use basebuilder's version of make_infodatabase
# Incremental-aware variant of the info database generation.
#
# When the build is not incremental, or no info database file exists yet,
# the work is delegated to basebuilder::make_infodatabase with the same
# arguments. Otherwise the classifiers are initialised and the archives are
# read with the buildproc in 'incinfodb' mode.
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Where the info database lives
    my $text_directory_path = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # An incremental addition needs an existing info database; in every
    # other case let the superclass generate it. The call must go through
    # $self so the object's data is available to basebuilder's method.
    unless ($self->{'incremental'} && -e $infodb_file_path) {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # From here on: incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # Step 1: initialise all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # Step 2: configure the buildproc. It is still needed to process any
    # associated files, but nothing is piped to the database, so no output
    # handle is set up.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # Step 3: read all the metadata from the files in the archives directory,
    # with the buildproc acting as the document processor
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    if ($self->{'gli'}) {
        print STDERR "</Stage>\n";
    }
}
472
473# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
474# * -remove and the document id on the command line.
475# *
476# * @param oid is the document identifier to be removed.
477# *
478# * @author John Rowe, DL Consulting Ltd.
479# */
# Removes a document from the Lucene index by running lucene_passes.pl with
# -remove and the document identifier on the command line.
#
# Arguments:
#   $oid - identifier of the document to remove.
# Returns: whatever the command printed on its standard output (backtick
#          semantics; the original returned this implicitly).
sub remove_document_from_database
{
    my ($self, $oid) = @_;

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # The identifier is placed inside shell double quotes. On non-Windows
    # hosts, escape the characters a POSIX shell still interprets there
    # (backslash, double quote, dollar, backtick) so the identifier reaches
    # the script verbatim. Windows quoting rules differ, so the value is
    # passed as before on that platform.
    my $quoted_oid = defined $oid ? $oid : "";
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if (!$on_windows) {
        $quoted_oid =~ s/([\\"\$`])/\\$1/g;
    }

    # Call lucene_passes.pl with -remove and the document ID on the command line
    return `$full_lucene_passes_exe -remove "$quoted_oid"`;
}
488# /** remove_document_from_database **/
489
490
4911;
492
493
Note: See TracBrowser for help on using the repository browser.