source: main/trunk/greenstone2/perllib/lucenebuilder.pm@ 21308

Last change on this file since 21308 was 20683, checked in by kjdon, 15 years ago

use incremental instead of keepold to decide whether to pass -removeold to lucene_passes. keepold is used when doing each pass of building separately, not for incremental building

  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database (which implements the grandparent
31# * class's empty function by calling lucene_passes.pl) and
32# * full_lucene_passes_exe, so that there is one place in the code
33# * that works out where the perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44use strict;
45no strict 'refs';
46
47
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
51
52# /**
53# * @author John Thompson, DL Consulting Ltd.
54# */
# /** Construct a lucenebuilder.
#  *
#  *  Construction is delegated to mgppbuilder->new with the same arguments;
#  *  the resulting object is re-blessed into the requested class and then
#  *  given the Lucene specific settings:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path of lucene_passes.pl under
#  *                             $GSDLHOME/bin/script
#  *    full_lucene_passes_exe - command string that runs that script
#  *                             through perl
#  *
#  *  @param  class  the class name to bless the new object into
#  *  @param  ...    all remaining arguments are handed to mgppbuilder->new
#  *  @return the blessed builder object
#  */
sub new {
    my $class = shift(@_);

    # Explicit method call: the indirect object form ("new mgppbuilder ...")
    # is parsed differently depending on which names perl already knows
    # about at compile time, so it is avoided here.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # The bare script path is kept separately from the command string that
    # executes it.
    $self->{'full_lucene_passes'} = $lucene_passes_script;

    # Tack perl on the beginning to ensure execution
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
    }

    return $self;
}
82# /** new() **/
83
# Capability query: always answers 1, because lucene can do incremental
# building.
sub is_incremental_capable {
    my $can_build_incrementally = 1;
    return $can_build_incrementally;
}
90
# /** Prime the build processor with what an earlier build already indexed.
#  *
#  *  Reads the existing build.cfg through $self->read_build_cfg() and copies
#  *  two of its lists into $self->{'buildproc'}:
#  *    indexfields   - each entry becomes a key (value 1) in
#  *                    buildproc->{'indexfields'}
#  *    indexfieldmap - each "name->value" entry becomes name => value in
#  *                    buildproc->{'indexfieldmap'}
#  *
#  *  Does nothing when read_build_cfg() returns undef. Entries of
#  *  indexfieldmap that are not of the form "name->value" are skipped, so
#  *  they cannot create a bogus entry under the empty key.
#  *
#  *  @return nothing
#  */
sub init_for_incremental_build {
    my $self = shift(@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{ $buildcfg->{'indexfields'} }) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{ $buildcfg->{'indexfieldmap'} }) {
            # Greedy match: everything before the LAST "->" is the name.
            my ($from, $to) = $mapping =~ /^(.*)\-\>(.*)$/;

            # No "->" in the entry: nothing sensible to record.
            next unless defined $from;

            $self->{'buildproc'}->{'indexfieldmap'}->{$from} = $to;
        }
    }

    return;
}
114
115# lucene has none of these options
# Let the superclass generate its index options, then switch off the
# casefold, stem, accentfold and stemindexes settings, since lucene has
# none of these options.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # Overwrite whatever the superclass set for these four settings.
    @{$self}{ 'casefold', 'stem', 'accentfold', 'stemindexes' } = (0, 0, 0, 0);

    return 0;
}
126
# Name of the build processor to use when none is given: always
# "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
132
133# this writes a nice version of the text docs
# /** Write the stored ("compressed") text of the documents.
#  *
#  *  Unless debugging, opens a pipe to lucene_passes.pl in "text" mode for
#  *  the "Doc" section level, points the build processor's output at that
#  *  pipe and runs every document through the plugins. In debug mode the
#  *  build processor writes to STDOUT instead and nothing is executed.
#  *
#  *  Does nothing at all when $self->{'no_text'} is set.
#  *
#  *  @param  textindex  the index specification handed to
#  *                     buildproc->set_index
#  *  @return nothing useful
#  *  @throws dies if lucene_passes.pl does not exist or the pipe to it
#  *          cannot be opened
#  */
sub compress_text {
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir  = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): $text_dir is not used again below (the command is
        # given $build_dir), so this conversion currently has no effect.
        # build_index converts $build_dir instead - confirm which is meant.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes     = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # The index directory argument is not needed for the text pass, so
        # a placeholder is passed in its position.
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        # The command goes through the shell (it may carry a redirection),
        # hence the two argument piped open.
        if (!-e $full_lucene_passes || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle($handle);
    $self->{'buildproc'}->set_mode('text');
    $self->{'buildproc'}->set_index($textindex);
    $self->{'buildproc'}->set_indexing_text(0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels($levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT (debug mode) must stay open; only the pipe is closed.
    close($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
216
# Build the requested indexes, one pass per index and level.
#
# With an $indexname containing a word character only that index is built;
# otherwise every index listed in collect_cfg->{'indexes'} is considered.
# For each index that $self->want_built() accepts, build_index is called
# once per entry of $self->{'levels'} (paragraph levels are skipped), with
# index_mapping temporarily pointing at a level specific directory name.
# Finishes by calling make_final_field_list().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # have we got para index? Warn once, however many levels mention it.
    foreach my $level (keys %{$self->{'levels'}}) {
        next unless $level =~ /paragraph/;
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene\n";
        last;
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        # Remember the plain directory name so it can be restored afterwards.
        my $plain_dirname = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/;    # we don't do para indexing

            # The first letter of the level is prefixed to the directory
            # name. Should probably check that the new name is unique, but
            # currently (with doc, sec and para) each first letter differs.
            my ($level_letter) = ($level =~ /^(.)/);
            $self->{'index_mapping'}->{$index} = $level_letter . $plain_dirname;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index, $llevel);
        }

        $self->{'index_mapping'}->{$index} = $plain_dirname;
    }

    #define the final field lists
    $self->make_final_field_list();
}
269
270
# Build one index at one level by piping documents to lucene_passes.pl.
#
# $index  - index specification of the form "fields[:subcollections[:languages]]",
#           where the subcollection and language parts are comma separated
# $llevel - the level name handed to lucene_passes.pl; it is also looked up
#           among the values of %mgppbuilder::level_map to find the matching
#           key of $self->{'levels'}
#
# Unless debugging, a pipe to "lucene_passes.pl index ..." is opened (with
# -removeold when this is not an incremental build) and the build processor
# writes to it; in debug mode it writes to STDOUT instead. Dies if the
# script is missing or the pipe cannot be opened.
sub build_index {
    my ($self, $index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir(&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes     = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    # A non-incremental build asks lucene_passes to remove the old index.
    my $opt_create_index = $self->{'incremental'} ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split(":", $index);

    # get the index expressions if this index belongs to subcollections;
    # names without an entry in collect.cfg are silently dropped
    my $indexexparr = [];
    my @subcollections = defined $subcollection ? split(/,/, $subcollection) : ();
    foreach my $subcoll_name (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
        push(@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    my $configured_languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    $languagemetadata = $configured_languagemetadata if defined $configured_languagemetadata;

    # Every language entry is passed on verbatim, whether or not it carries
    # a leading "!".
    my $langarr = [ defined $language ? split(/,/, $language) : () ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section";    #always

    # Find the levels key whose mapped name is $llevel. There is no early
    # exit, so if several keys map to it the last one visited wins.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        $dom_level = $key if $mgppbuilder::level_map{$key} eq $llevel;
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    # work on one level at a time
    my $local_levels = { $dom_level => 1 };

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($index, $indexexparr);
    $buildproc->set_index_languages($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text(1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();

    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # Put back the full set of levels for whoever uses the processor next.
    $self->{'buildproc'}->set_levels($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
392
393# /** A modified version of the basebuilder.pm's function that generates the
394# * information database from the GA documents. We need to change this
395# * so that if we've been asked to do an incremental build we only add
396# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
397# * module. All other classifiers and metadata will be ignored.
398# */
399# This was added to utilize DLC's incremental updating of Hierarchy classifiers. They are heading towards just using dynamic classifiers, and we do not want to use this code either. So now, we just use basebuilder's version of make_infodatabase
# Variant of make_infodatabase that performs an incremental addition.
#
# When the build is not incremental, or no info database file exists yet
# under <build_dir>/text, the work is handed to
# basebuilder::make_infodatabase with the caller's remaining arguments.
# Otherwise the classifiers are initialised and the documents are read
# through the plugins with the build processor in 'incinfodb' mode.
sub make_infodatabase_dlc {
    my $self = shift(@_);
    my $outhandle = $self->{'outhandle'};

    # Get info database file path
    my $text_directory_path = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path(
        $self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # Incremental addition can only occur if an information database
    # already exists. If it doesn't, let the super class's function be
    # called once to generate it.
    my $can_add_incrementally = $self->{'incremental'} && -e $infodb_file_path;
    unless ($can_add_incrementally) {
        # Called as a method on $self: a plain function call
        # (basebuilder::make_infodatabase(@_)) would lose all the $self data.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away
    #    with the complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($assocdir);
    $self->{'buildproc'}->set_mode('incinfodb');    # Very Important
    $self->{'buildproc'}->set_assocdir($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                  $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
443
444# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
445# * -remove and the document id on the command line.
446# *
447# * @param oid is the document identifier to be removed.
448# *
449# * @author John Rowe, DL Consulting Ltd.
450# */
# Run the lucene_passes.pl command line with "-remove" followed by the
# quoted document identifier, and hand back whatever the command wrote to
# standard output.
#
# NOTE(review): the identifier is interpolated into a shell command, so it
# is assumed to come from a trusted source - confirm against callers.
sub remove_document_from_database {
    my $self = shift(@_);
    my ($oid) = @_;

    # The command string that runs lucene_passes.pl through perl
    my $lucene_passes_cmd = $self->{'full_lucene_passes_exe'};

    return qx{$lucene_passes_cmd -remove "$oid"};
}
459# /** remove_document_from_database **/
460
461
4621;
463
464
Note: See TracBrowser for help on using the repository browser.