source: gsdl/trunk/perllib/lucenebuilder.pm@ 19617

Last change on this file since 19617 was 17575, checked in by kjdon, 16 years ago

Implemented init_for_incremental_build to read in indexfields and indexfieldmap, so that an incremental build does not lose this information.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44use strict;
45no strict 'refs';
46
47
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
51
# /**
#  * Constructor.  Builds the underlying mgppbuilder object, re-blesses it
#  * into the requested class and records where lucene_passes.pl lives:
#  *
#  *   $self->{'buildtype'}              - always "lucene"
#  *   $self->{'full_lucene_passes'}     - path of lucene_passes.pl, i.e.
#  *                                       $GSDLHOME/bin/script/lucene_passes.pl
#  *   $self->{'full_lucene_passes_exe'} - command line prefix used to run it
#  *
#  * @param  all arguments are passed straight through to mgppbuilder's new()
#  * @return the new builder object
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Call the parent constructor with an explicit method call; the
    # indirect-object form ("new mgppbuilder (...)") is ambiguous to the
    # parser and is discouraged by perlobj.
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");
    $self->{'full_lucene_passes'} = $lucene_passes_script;

    # Tack perl on the beginning to ensure execution.
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
    }

    return $self;
}
# /** new() **/
83
# Seed the build processor with the index field information recorded by a
# previous build, so that an incremental build knows what has already been
# indexed.
#
# Reads the existing build configuration via $self->read_build_cfg() and
# copies two entries into $self->{'buildproc'}:
#   'indexfields'   - list of field names; each becomes a key with value 1
#   'indexfieldmap' - list of "name->value" strings; each becomes name => value
#
# Does nothing if read_build_cfg() returns undef.  Entries of
# 'indexfieldmap' that do not contain "->" are skipped.
sub init_for_incremental_build {
    my $self = shift (@_);

    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            # The first group is greedy, so the split happens at the last "->".
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # A malformed entry leaves both captures undefined; storing it
            # would create a bogus '' => undef mapping.
            next unless defined $f;

            $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v;
        }
    }

    return;
}
107
# Run the parent class's option generation, then force the casefold, stem,
# accentfold and stemindexes settings to 0: lucene has none of these options.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported (qw(casefold stem accentfold stemindexes)) {
        $self->{$unsupported} = 0;
    }

    return 0;
}
119
# Returns the string "lucenebuildproc".
# NOTE(review): presumably the name of the build-processor package the
# caller should load for this builder - confirm against the caller.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
125
# This writes a nice version of the text docs.
#
# Pipes the collection's documents through lucene_passes.pl in "text" mode
# (sections "Doc") by pointing the build processor's output handle at the
# pipe and running the plugins over $self->{'source_dir'}.
#
# Parameters:
#   $textindex - index specification handed to the build processor's
#                set_index()
#
# Returns immediately when $self->{'no_text'} is set.  In debug mode the
# build processor writes to STDOUT and lucene_passes.pl is not run.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # NOTE(review): build_index converts the separators of $build_dir on
    # Windows; this sub passes $build_dir unchanged - confirm intentional.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child and returns false when it exited
    # with a non-zero status, so report that instead of ignoring it.
    if (!$self->{'debug'} && !close ($handle))
    {
        print $outhandle "Warning: lucenebuilder::compress_text - lucene_passes did not finish cleanly (status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
209
# Build the requested index (or, when no usable name is given, every index
# listed in $self->{'collect_cfg'}->{'indexes'}) once per configured level.
#
# Parameters:
#   $indexname - optional; a single index specification to build
#
# For each level the index's directory name in $self->{'index_mapping'} is
# temporarily prefixed with the first character of the level name while
# build_index() runs, and restored to the unprefixed name afterwards.
# Paragraph levels are skipped.  Finishes by calling make_final_field_list().
sub build_indexes {
    my ($self, $indexname) = @_;
    my $out = $self->{'outhandle'};
    my $chatty = ($self->{'verbosity'} >= 1);

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # have we got para index?
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print $out "Warning: Paragraph level indexing not supported by Lucene\n";
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $out "\n*** ignoring index $index\n" if $chatty;
            next;
        }

        my $plain_dir = $self->{'index_mapping'}->{$index};
        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($initial) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $initial.$plain_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($chatty) {
                print $out "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            # NOTE(review): the level attribute is emitted unquoted, unlike
            # name and source - confirm the consumer expects this.
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }
        # put the unprefixed directory name back
        $self->{'index_mapping'}->{$index} = $plain_dir;
    }

    # define the final field lists
    $self->make_final_field_list();
}
262
263
# Build one index at one level by piping the collection's documents through
# lucene_passes.pl in "index" mode.
#
# Parameters:
#   $index  - index specification of the form "fields[:subcolls[:languages]]",
#             where subcolls and languages are comma separated lists
#   $llevel - the level tag handed to lucene_passes.pl; it is mapped back to
#             a level name through %mgppbuilder::level_map
#
# The index is written below $self->{'build_dir'} in the directory recorded
# for $index in $self->{'index_mapping'}.  Unless $self->{'keepold'} is set,
# "-removeold" is passed to lucene_passes.pl.  In debug mode the build
# processor writes to STDOUT and lucene_passes.pl is not run.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info; the leading
    # fields part of the specification is not needed here.
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # only subcollections defined in the collection config contribute
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
            push (@$indexexparr, $expression) if defined $expression;
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Entries are passed on exactly as written,
    # including any leading "!" marking a negated language.
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    # NOTE(review): unlike compress_text, the plugin read below is not
    # bracketed by plugin::begin / plugin::end - confirm intentional.
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child and returns false when it exited
    # with a non-zero status, so report that instead of ignoring it.
    if (!$self->{'debug'} && !close ($handle)) {
        print $outhandle "Warning: lucenebuilder::build_index - lucene_passes did not finish cleanly (status $?)\n";
    }

    $self->print_stats();

    # restore the full set of levels on the build processor
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
385
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents.  When an incremental build
#  *  has been requested, it only adds metadata to autohierarchy classifiers
#  *  via the IncrementalBuildUtils module; all other classifiers and
#  *  metadata are ignored.
#  *
#  *  Falls back to basebuilder::make_infodatabase (passing on any arguments)
#  *  when the build is not incremental, or when no information database
#  *  file exists yet.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers.  They are heading towards just using dynamic classifiers, and
# we do not want to use this code either.  So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives (inside the text directory)
    my $infodb_file_path = &dbutil::get_infodb_file_path(
        $self->{'infodbtype'},
        $self->{'collection'},
        &util::filename_cat($self->{'build_dir'}, "text"));

    # Incremental addition can only occur if an information database already
    # exists.  If it doesn't, let the super class's function be called once
    # to generate it.
    my $can_add_incrementally = $self->{'incremental'} && -e $infodb_file_path;
    unless ($can_add_incrementally)
    {
        # Note: calling basebuilder::make_infodatabase(@_) as a plain
        #       function doesn't work, as all the $self data is lost.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
436
# /** Lucene specific document removal function.  This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  @param  oid is the document identifier to be removed.  When it is
#  *          undefined or empty no command is run.
#  *  @return whatever the command printed on its standard output (a list of
#  *          lines in list context, one string in scalar context); nothing
#  *          when no command was run.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my ($self, $oid) = @_;

    # An undefined or empty id cannot identify a document, so don't spawn a
    # process for it.
    return unless defined $oid && $oid ne "";

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # Call lucene_passes.pl with -remove and the document ID on the command line
    # NOTE(review): $oid is interpolated into a shell command line - confirm
    # that document identifiers can never contain shell metacharacters.
    my @output = `$full_lucene_passes_exe -remove "$oid"`;

    # Backticks leave the child's status in $?; report a failure instead of
    # silently ignoring it.
    if ($? != 0)
    {
        print STDERR "Warning: lucenebuilder::remove_document_from_database - removing \"$oid\" failed (status $?)\n";
    }

    return wantarray ? @output : join("", @output);
}
# /** remove_document_from_database **/
453
454
4551;
456
457
Note: See TracBrowser for help on using the repository browser.