source: trunk/gsdl/perllib/lucenebuilder.pm@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 18 years ago

Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.2 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database, which overrides the grandparent
31# * class's empty function to call lucene_passes.pl, and
32# * full_lucene_passes_exe, so that there is one place in the code that
33# * works out where the perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44
45use IncrementalBuildUtils;
46
# Establish the inheritance chain at compile time: lucenebuilder derives
# from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
51# /**
52# * @author John Thompson, DL Consulting Ltd.
53# */
# Constructor. Builds an mgppbuilder object, re-blesses it into the
# requested class and adds the Lucene specific settings.
#
# Positional arguments (after the class name), all forwarded unchanged to
# the mgppbuilder constructor:
#   collection, source_dir, build_dir, verbosity, maxdocs, debug, keepold,
#   allclassifications, outhandle, no_text, faillog, gli, incremental
#
# Fields set on the returned object:
#   buildtype              - always "lucene"
#   full_lucene_passes     - path of lucene_passes.pl
#   full_lucene_passes_exe - command line prefix used to run that script
#   incremental            - the 'incremental' argument, stored as given
sub new {
    my $class = shift(@_);

    # Only the last positional argument is used directly in this
    # constructor; the rest are consumed by the parent constructor.
    my $incremental = $_[12];

    # Direct method call rather than the indirect form "new mgppbuilder
    # (...)", which is ambiguous in a package that defines its own new().
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # Tack perl on the beginning to ensure execution. When the OS
    # executable suffix is ".exe" the interpreter is named explicitly,
    # otherwise perl is invoked with -S.
    $self->{'full_lucene_passes'} = "$lucene_passes_script";
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
    }

    # We must also record whether we have been asked to do just an
    # incremental build (which makes no difference to the Lucene indexing
    # bit, just the building of the classifiers in the GDBM).
    $self->{'incremental'} = $incremental;

    return $self;
}
88# /** new() **/
89
# Returns the name of the document processor this builder uses by default.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = 'lucenebuildproc';
    return $buildproc_name;
}
95
# Writes a nice version of the text docs: every document is run through
# the buildproc in 'text' mode and the result is piped into
# "lucene_passes.pl text Doc <build_dir> dummy".
#
# @param textindex  index specification handed to the buildproc's
#                   set_index()
#
# Does nothing when $self->{'no_text'} is set. When $self->{'debug'} is
# set, nothing is piped to lucene_passes.pl and the buildproc output goes
# to STDOUT instead. Dies if lucene_passes.pl does not exist or the pipe
# to it cannot be opened.
sub compress_text {
    my $self = shift(@_);

    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir($text_dir);

    # NOTE(review): the original converted "/" to "\" in $text_dir on
    # Windows at this point, but $text_dir is never used again, so that
    # had no effect and has been dropped. build_index() converts
    # $build_dir instead - confirm whether that was the intent here too.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # The handle is passed around by name, as the rest of this module does.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    }
    else {
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $cmd\n";

        if (!-e "$full_lucene_passes" || !open(PIPEOUT, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the function that actually failed (this message
            # used to claim to come from build_index).
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($levels->{'section'}) {
        $gdbm_level = "section";
    }

    # Get rid of para if we had it. Note this leaves the key in place
    # with an undefined value (it is not a delete).
    undef $levels->{'paragraph'};

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($textindex);
    $buildproc->set_indexing_text(0);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing the pipe once is enough; STDOUT (debug mode) is left open.
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
183
# Builds the requested indexes, once for each configured level other than
# paragraph.
#
# @param indexname  optional; when it contains at least one word character
#                   only that index is built, otherwise every index listed
#                   in $self->{'collect_cfg'}->{'indexes'} is considered.
#
# While an index is being built at a given level its entry in
# $self->{'index_mapping'} is temporarily prefixed with the first letter
# of the level name; the unprefixed value is restored afterwards.
sub build_indexes {
    my $self = shift(@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    # (lexical loop variable: the original iterated with the package
    # global $index)
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            foreach my $level (keys %{$self->{'levels'}}) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index, $llevel);
            }
            # put the level-independent directory name back
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    # define the final field lists
    $self->make_final_field_list();
}
230
231# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
232# * -remove and the document id on the command line.
233# *
234# * @param oid is the document identifier to be removed.
235# *
236# * @author John Rowe, DL Consulting Ltd.
237# */
# Removes a document from the Lucene index by running lucene_passes.pl
# with -remove and the document identifier on the command line.
#
# @param oid  the identifier of the document to remove
# @return     whatever the command wrote to standard output (a list of
#             lines in list context, one string in scalar context);
#             nothing when no identifier was supplied
#
# A warning is written to STDERR when no identifier is given or when the
# command reports a non-zero status; neither case is fatal.
sub remove_document_from_database {
    my ($self, $oid) = @_;

    # Without an identifier there is nothing to remove, so don't spawn
    # the script at all.
    unless (defined $oid && length $oid) {
        print STDERR "lucenebuilder::remove_document_from_database - no document identifier given\n";
        return;
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # Call lucene_passes.pl with -remove and the document ID on the
    # command line.
    # NOTE(review): $oid is interpolated into a shell command line as-is;
    # this is only safe if identifiers never contain shell metacharacters
    # - confirm against the callers.
    my @output = `$full_lucene_passes_exe -remove "$oid"`;

    # The exit status used to be ignored, hiding failed removals.
    if ($? != 0) {
        print STDERR "lucenebuilder::remove_document_from_database - removing \"$oid\" failed (status $?)\n";
    }

    return wantarray ? @output : join("", @output);
}
246# /** remove_document_from_database **/
247
# Builds a single index at a single level by running the documents through
# the buildproc and piping the result into
# "lucene_passes.pl [-create] index <level> <build_dir> <indexdir>".
#
# @param index   index specification of the form
#                "fields[:subcollections[:languages]]", where the
#                subcollections and languages parts are comma separated
# @param llevel  the level tag passed to lucene_passes.pl (a value of
#                %mgppbuilder::level_map)
#
# -create is omitted when $self->{'keepold'} is set. When
# $self->{'debug'} is set, nothing is piped to lucene_passes.pl and the
# buildproc output goes to STDOUT. Dies if lucene_passes.pl does not exist
# or the pipe to it cannot be opened.
sub build_index {
    my $self = shift(@_);
    my ($index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir(&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    # (the leading fields part is not needed here)
    my (undef, $subcollection, $language) = split(":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll_name (split /,/, $subcollection) {
            my $subcoll_expr = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
            push(@$indexexparr, $subcoll_expr) if defined $subcoll_expr;
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # Each entry is passed through exactly as written, including any
    # leading "!" (the original stripped the "!" and then put it back).
    my $langarr = [];
    push(@$langarr, split(/,/, $language)) if defined $language;

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The handle is passed around by name, as the rest of this module does.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open(PIPEOUT, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # The original interpolated the undeclared $lucene_passes_exe
            # here, so the message never named the command.
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($store_levels->{'section'}) {
        $gdbm_level = "section";
    }

    # Map the level tag back to the name of the configured level.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($index, $indexexparr);
    $buildproc->set_index_languages($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text(1);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # hand the full set of levels back to the buildproc
    $buildproc->set_levels($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
374
375# /** A modified version of the basebuilder.pm's function that generates the
376# * information database (GDBM) from the GA documents. We need to change this
377# * so that if we've been asked to do an incremental build we only add
378# * metadata to autohierarchy classifiers via the IncrementalBuildUtils
379# * module. All other classifiers and metadata will be ignored.
380# */
# Creates or updates the information database for the collection.
#
# The inherited basebuilder implementation is used unless an incremental
# build was requested AND the database file text/<collection>.ldb (.bdb
# when util::is_little_endian() is false) already exists in the build
# directory. In the incremental case the classifiers are initialised and
# the documents are read with the buildproc in 'incinfodb' mode.
sub make_infodatabase {
    my $self = shift(@_);
    my $outhandle = $self->{'outhandle'};

    # Work out where the existing database would live.
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $self->{'collection'} . $dbext);

    # Incremental addition is only possible on top of an existing
    # database; in every other case let the superclass generate it.
    # Note: the call has to go through $self - calling
    # basebuilder::make_infodatabase(@_) directly would lose all the
    # $self data.
    unless ($self->{'incremental'} && -e $infodb_file) {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # Step 1: initialise all the classifiers.
    &classify::init_classifiers($self->{'classifiers'});

    # Step 2: configure the buildproc. It is still needed to process any
    # associated files, but nothing is expected to be piped to txt2db, so
    # no output handle is set up.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode('incinfodb'); # Very Important
    $buildproc->set_assocdir($assocdir);

    # Step 3: read the documents from the source directory, with the
    # buildproc acting as the document processor.
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'}, "", {},
                  $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
424
4251;
426
427
Note: See TracBrowser for help on using the repository browser.