source: trunk/gsdl/perllib/lucenebuilder.pm@ 12974

Last change on this file since 12974 was 12974, checked in by kjdon, 15 years ago

The incremental option was renamed to incremental_dlc, and it's now set by basebuilder

  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44
45# use IncrementalBuildUtils;
46
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
# /**
#  * Constructor. Creates an mgppbuilder object, re-blesses it into the
#  * requested class, marks it as a "lucene" build and records where
#  * lucene_passes.pl lives so the other methods share a single definition
#  * of its location.
#  *
#  * Fields set on the object:
#  *   buildtype              - "lucene"
#  *   full_lucene_passes     - path of lucene_passes.pl under
#  *                            $GSDLHOME/bin/script
#  *   full_lucene_passes_exe - shell command prefix that runs that script
#  *                            through perl
#  *
#  * @param  every argument after the class name is handed unchanged to the
#  *         mgppbuilder constructor
#  * @return the new builder object
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Direct method call rather than the ambiguous indirect-object form
    # "new mgppbuilder (@_)".
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = util::get_os_exe();

    # So where is lucene_passes.pl anyway?
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    my $lucene_passes_script = util::filename_cat($scriptdir, "lucene_passes.pl");

    $self->{'full_lucene_passes'} = $lucene_passes_script;

    # Tack perl on the beginning to ensure execution. When the OS executable
    # suffix is ".exe" the interpreter is named with that suffix; otherwise
    # perl is invoked with -S.
    $self->{'full_lucene_passes_exe'} = ($exe eq ".exe")
        ? "perl$exe \"$lucene_passes_script\""
        : "perl -S \"$lucene_passes_script\"";

    return $self;
}
# /** new() **/
82
# Lucene has none of the casefold / stem / accentfold index options, so
# every one of them (and stemindexes) is switched off on the builder.
sub generate_index_options {
    my ($self) = @_;

    $self->{$_} = 0 for qw(casefold stem accentfold);
    return $self->{'stemindexes'} = 0;
}
92
# Returns the name of the build processor used by default with this
# builder: always the string "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    return 'lucenebuildproc';
}
98
# /**
#  * Writes a nice version of the text docs by streaming the buildproc
#  * output into "lucene_passes.pl text Doc <build_dir> dummy".
#  *
#  * Does nothing when $self->{'no_text'} is set. When $self->{'debug'} is
#  * set, lucene_passes.pl is not started and the buildproc is pointed at
#  * STDOUT instead.
#  *
#  * @param textindex  index specification handed to the buildproc through
#  *                   set_index()
#  *
#  * Dies if lucene_passes.pl does not exist or the pipe to it cannot be
#  * opened (a <FatalError> element is printed first when $self->{'gli'} is
#  * set). A non-clean exit of the child is reported as a warning on STDERR.
#  */
sub compress_text {
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = util::filename_cat($self->{'build_dir'}, "");
    util::mk_all_dir($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): $text_dir is not read again after this point, while
        # build_index() converts $build_dir here instead - confirm which
        # path was meant to be converted.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes     = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # The handle is handed to the buildproc by (package-qualified) name.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        # The command must go through the shell (it may carry a redirect),
        # hence the string form of the pipe open.
        if (!-e $full_lucene_passes || !open(PIPEOUT, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = $levels->{'section'} ? "section" : "document";

    # get rid of para if we had it.
    # NOTE(review): this leaves a 'paragraph' key with an undefined value in
    # the shared $self->{'levels'} hash rather than deleting the key.
    undef $levels->{'paragraph'};

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($textindex);
    $buildproc->set_indexing_text(0);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();

    plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                  $buildproc, $self->{'maxdocs'});
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    plugin::end($self->{'pluginfo'});

    # Close the pipe exactly once, and only if it was opened. Closing a pipe
    # waits for the child, so a false result is the only sign that
    # lucene_passes.pl failed.
    unless ($self->{'debug'}) {
        close(PIPEOUT)
            or print STDERR "Warning: lucene_passes did not finish cleanly (\$! = '$!', \$? = $?)\n";
    }

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
186
# /**
#  * Builds every wanted index once per configured level, skipping any
#  * level whose name matches "paragraph".
#  *
#  * For each level the entry for the index in $self->{'index_mapping'} is
#  * temporarily prefixed with the first letter of the level name while
#  * build_index() runs, and restored afterwards.
#  *
#  * @param indexname  optional; when it contains a word character only
#  *                   this index is considered, otherwise all of
#  *                   $self->{'collect_cfg'}->{'indexes'} are
#  */
sub build_indexes {
    my $self = shift(@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    # (the loop variable is lexical; it used to be an undeclared global)
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $idx = $self->{'index_mapping'}->{$index};
        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($pindex) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $pindex.$idx;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            # NOTE(review): the level attribute is emitted without quotes;
            # left as is because the consumer of this output is not visible here.
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index, $llevel);
        }

        # put the un-prefixed directory name back
        $self->{'index_mapping'}->{$index} = $idx;
    }

    #define the final field lists
    $self->make_final_field_list();
}
233
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  @param  oid is the document identifier to be removed.
#  *  @return whatever the command wrote to standard output: a list of lines
#  *          in list context, otherwise a single string (undef if the
#  *          command could not be started).
#  *
#  *  A non-zero exit status of the command is reported as a warning on
#  *  STDERR instead of being silently dropped.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database {
    my ($self, $oid) = @_;

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is interpolated into a shell command line unescaped;
    # this is only safe while document identifiers never contain quotes or
    # shell metacharacters - confirm against the callers.
    my $cmd = "$full_lucene_passes_exe -remove \"$oid\"";

    # Call lucene_passes.pl with -remove and the document ID on the command
    # line, running it in the same context the caller used.
    my $list_wanted = wantarray;
    my @output = $list_wanted ? `$cmd` : scalar(`$cmd`);

    if ($? != 0) {
        print STDERR "Warning: lucene_passes -remove \"$oid\" failed (\$? = $?)\n";
    }

    return $list_wanted ? @output : $output[0];
}
# /** remove_document_from_database **/
250
# /**
#  * Builds one index at one level by streaming the buildproc output into
#  * "lucene_passes.pl [-create] index <level> <build_dir> <index_dir>".
#  *
#  * @param index   index description of the form
#  *                "fields[:subcollection,...[:language,...]]"
#  * @param llevel  level tag passed to lucene_passes.pl; it is matched
#  *                against the values of %mgppbuilder::level_map to find
#  *                the corresponding level name
#  *
#  * "-create" is passed unless $self->{'keepold'} is set. When
#  * $self->{'debug'} is set, lucene_passes.pl is not started and the
#  * buildproc is pointed at STDOUT instead.
#  *
#  * Dies if lucene_passes.pl does not exist or the pipe to it cannot be
#  * opened (a <FatalError> element is printed first when $self->{'gli'} is
#  * set). A non-clean exit of the child is reported as a warning on STDERR.
#  */
sub build_index {
    my $self = shift(@_);
    my ($index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    util::mk_all_dir(util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes     = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection, $language) = split(":", $index);

    # get the index expression if this index belongs to a subcollection
    my $indexexparr = [];
    my @subcollections = defined $subcollection ? split(/,/, $subcollection) : ();
    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push(@$indexexparr, $expression) if defined $expression;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # Each entry is passed on verbatim, including any leading "!" (the old
    # loop stripped that prefix and then put it straight back).
    my $langarr = defined $language ? [ split(/,/, $language) ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The handle is handed to the buildproc by (package-qualified) name.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";

        # The command must go through the shell (it may carry a redirect),
        # hence the string form of the pipe open.
        if (!-e $full_lucene_passes || !open(PIPEOUT, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # This message used to interpolate an undeclared variable and so
            # never named the command that failed.
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = $store_levels->{'section'} ? "section" : "document";

    # Find the level name whose tag in %mgppbuilder::level_map is $llevel.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($index, $indexexparr);
    $buildproc->set_index_languages($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text(1);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                 "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child, so a false result is the only
    # sign that lucene_passes.pl failed.
    unless ($self->{'debug'}) {
        close(PIPEOUT)
            or print STDERR "Warning: lucene_passes did not finish cleanly (\$! = '$!', \$? = $?)\n";
    }

    $self->print_stats();

    # hand the full set of levels back to the buildproc
    $buildproc->set_levels($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
377
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents.
#  *
#  *  Unless an incremental build was requested ($self->{'incremental_dlc'})
#  *  AND the database file text/<collection>.ldb (.bdb when the platform is
#  *  not little endian) already exists in the build directory, the work is
#  *  handed straight to basebuilder::make_infodatabase with the same
#  *  arguments.
#  *
#  *  Otherwise the classifiers are initialised, the buildproc is switched to
#  *  'incinfodb' mode with an "assoc" directory under the build directory,
#  *  and the source directory is read through the plugins.
#  *  NOTE(review): what 'incinfodb' mode does with classifiers and metadata
#  *  is decided by the buildproc and is not visible here.
#  */
sub make_infodatabase {
    my $self = shift(@_);
    my $outhandle = $self->{'outhandle'};

    my $dbext = util::is_little_endian() ? ".ldb" : ".bdb";
    my $infodb_file = util::filename_cat(
        $self->{'build_dir'}, "text", $self->{'collection'} . $dbext
    );

    # Incremental addition can only occur if the database file already
    # exists. If it doesn't (or no incremental build was asked for), let the
    # super class's function generate it.
    my $can_add_incrementally = $self->{'incremental_dlc'} && -e $infodb_file;
    unless ($can_add_incrementally) {
        # Must be a method call: a plain basebuilder::make_infodatabase(@_)
        # would lose all the $self data.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Init all the classifiers
    classify::init_classifiers($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    We still need this to process any associated files - but we don't
    #    expect to pipe anything to txt2db, so no output handle is set up.
    my $assoc_path = util::filename_cat($self->{'build_dir'}, "assoc");
    util::mk_all_dir($assoc_path);
    my $processor = $self->{'buildproc'};
    $processor->set_mode('incinfodb'); # Very Important
    $processor->set_assocdir($assoc_path);

    # 3. Read in all the metadata from the files in the source directory,
    #    with the buildproc acting as the document processor.
    plugin::read(
        $self->{'pluginfo'}, $self->{'source_dir'}, "", {},
        $processor, $self->{'maxdocs'}, 0, $self->{'gli'}
    );

    print STDERR "</Stage>\n" if $self->{'gli'};
}
427
4281;
429
430
Note: See TracBrowser for help on using the repository browser.