source: gsdl/trunk/perllib/lucenebuilder.pm@ 17566

Last change on this file since 17566 was 17566, checked in by kjdon, 16 years ago

lucene no longer does anything with paragraphs, so we print a warning if the user has specified them

  • Property svn:keywords set to Author Date Id Revision
File size: 15.1 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44use strict;
45no strict 'refs';
46
47
# Establish the inheritance chain at compile time: a lucenebuilder is a
# specialised mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
51
# /**
#  * Construct a lucenebuilder: an mgppbuilder object re-blessed into this
#  * class, with its build type set to "lucene".
#  *
#  * All arguments are handed unchanged to the mgppbuilder constructor.
#  * Two extra fields are recorded on the object so that the location of the
#  * lucene_passes.pl script is worked out in exactly one place:
#  *   full_lucene_passes     - path of lucene_passes.pl under
#  *                            $GSDLHOME/bin/script
#  *   full_lucene_passes_exe - shell command that runs that script via perl
#  *
#  * @return the new builder object, blessed into $class
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Call the parent constructor with an explicit method call rather than
    # the indirect-object form "new mgppbuilder (...)", whose parsing is
    # ambiguous in a package that defines its own new().
    my $self = mgppbuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe ();

    # So where is lucene_passes.pl anyway?
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");
    $self->{'full_lucene_passes'} = $lucene_passes_script;

    # Tack perl on the beginning to ensure execution. Where executables carry
    # an ".exe" suffix perl is named explicitly; elsewhere "perl -S" is used.
    my $perl_command = ($exe eq ".exe") ? "perl$exe" : "perl -S";
    $self->{'full_lucene_passes_exe'} = "$perl_command \"$lucene_passes_script\"";

    return $self;
}
82# /** new() **/
83
# Lucene has none of the casefold / stem / accentfold index options, so once
# the parent class has run its own generate_index_options() every one of
# these settings is forced to 0 on the builder.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$option} = 0;
    }

    # same value the final assignment used to yield
    return 0;
}
95
# Name of the document processor used when building with lucene.
# The builder object itself is not consulted.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
101
# Write out the document text by piping every document, via the buildproc,
# into "lucene_passes.pl text Doc <build_dir> dummy".
#
# Arguments:
#   $self      - the builder object
#   $textindex - index specification handed to the buildproc's set_index()
#
# Does nothing when $self->{'no_text'} is set. With $self->{'debug'} set the
# buildproc output goes to STDOUT instead of to lucene_passes.pl.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # make sure the text directory exists
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # get any os specific stuff
    # NOTE(review): build_index() converts $build_dir to backslashes on
    # windows. This function only ever converted $text_dir, after its last
    # use, so no path conversion took effect here; that is kept as is.
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";
    my $command = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $command\n";
        # three-argument open: the command still goes through the shell, but
        # the pipe mode can no longer be influenced by the command text
        if (!-e "$full_lucene_passes" || !open($handle, '|-', $command))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # report the function that actually failed (this message used
            # to claim it came from build_index)
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    # run every document through the plugins and on into the buildproc
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
185
# Build the requested indexes, once per configured level.
#
# Arguments:
#   $self      - the builder object
#   $indexname - optional; when it contains a word character only that index
#                is built, otherwise every index listed in collect_cfg is.
#
# Paragraph levels are skipped (with a single warning) because lucene does
# not index them. Each index is built into a subdirectory whose name is the
# first letter of the level followed by the mapped index directory name.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [ $indexname ];
    }
    else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # have we got para index? warn just once if so
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene\n";
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        # remember the plain directory name; it is temporarily replaced by a
        # level-specific one while each level is being built
        my $plain_dir = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            # we don't do para indexing
            next if $level =~ /paragraph/;

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($first_letter) = ($level =~ /^(.)/);
            $self->{'index_mapping'}->{$index} = $first_letter . $plain_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index, $llevel);
        }

        # put the plain directory name back
        $self->{'index_mapping'}->{$index} = $plain_dir;
    }

    #define the final field lists
    $self->make_final_field_list();
}
238
239
# Build one index at one level by piping every document, via the buildproc,
# into "lucene_passes.pl [-removeold] index <level> <build_dir> <indexdir>".
#
# Arguments:
#   $self   - the builder object
#   $index  - index description of the form "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma separated lists
#   $llevel - level name as passed to lucene_passes (a value of
#             %mgppbuilder::level_map)
#
# With $self->{'debug'} set the buildproc output goes to STDOUT instead.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my ($self, $index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # the perl script that runs lucene, and the command that invokes it
    my $passes_script = $self->{'full_lucene_passes'};
    my $passes_command = $self->{'full_lucene_passes_exe'};

    # start from a clean index unless we were asked to keep the old one
    my $opt_create_index = "-removeold";
    $opt_create_index = "" if $self->{'keepold'};

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # collect the expression of every named subcollection that is
    # actually defined in the collection configuration
    my @subcollection_exprs = ();
    if (defined $subcollection) {
        foreach my $subcollection_name (split /,/, $subcollection) {
            my $expr = $self->{'collect_cfg'}->{'subcollection'}->{$subcollection_name};
            push (@subcollection_exprs, $expr) if defined ($expr);
        }
    }

    # the languages wanted in this index, passed on exactly as written
    # (including any leading "!")
    my @wanted_languages = ();
    @wanted_languages = split /,/, $language if (defined $language);

    my $language_metadata = defined ($self->{'collect_cfg'}->{'language_metadata'})
        ? $self->{'collect_cfg'}->{'language_metadata'}
        : "Language";

    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $command = "$passes_command $opt_create_index index $llevel \"$build_dir\" \"$indexdir\" $osextra";
    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        print STDERR "Cmd: $command\n";
        my $started = (-e "$passes_script") && open($handle, "| $command");
        if (!$started) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $passes_command\n";
        }
    }

    # find which of our level names corresponds to $llevel
    my $store_levels = $self->{'levels'};
    my $dom_level = "";
    foreach my $level_name (keys %$store_levels) {
        next unless $mgppbuilder::level_map{$level_name} eq $llevel;
        $dom_level = $level_name;
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    # set up the document processor; work on one level at a time, with the
    # database always at section level
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, \@subcollection_exprs);
    $buildproc->set_index_languages ($language_metadata, \@wanted_languages) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ({ $dom_level => 1 });
    $buildproc->set_db_level ("section");
    $buildproc->reset();

    # run every document through the plugins and on into the buildproc
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # give the buildproc back the full set of levels
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
361
# /** A modified version of basebuilder.pm's function that generates the
#  * information database from the GA documents. For an incremental build
#  * against an existing info database it re-reads the archives with the
#  * buildproc in 'incinfodb' mode; in every other case it simply defers to
#  * basebuilder's make_infodatabase.
#  *
#  * Any arguments are forwarded to basebuilder::make_infodatabase.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and
# we do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my ($self, @passthrough_args) = @_;
    my $outhandle = $self->{'outhandle'};

    # Get info database file path
    my $text_directory_path = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # Incremental addition can only occur if an information database already
    # exists. If we aren't building incrementally, or there is no database
    # yet, let the super class function generate it.
    my $can_add_incrementally = $self->{'incremental'} && -e $infodb_file_path;
    unless ($can_add_incrementally)
    {
        # Called as a method (not as basebuilder::make_infodatabase(...))
        # so that the $self data is not lost.
        $self->basebuilder::make_infodatabase(@passthrough_args);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away
    #    with the complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory,
    #    with the buildproc acting as the document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
412
# /** Lucene specific document removal function. It runs the lucene_passes
#  * command recorded on the builder with "-remove" followed by the quoted
#  * document id, and hands back whatever that command printed on stdout.
#  *
#  * @param oid is the document identifier to be removed.
#  *
#  * @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift (@_);
    my $oid = shift (@_);

    # The command that runs lucene_passes.pl was worked out in new()
    my $lucene_passes_command = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is placed inside double quotes in a shell command
    # without any escaping, so it must not contain shell metacharacters.
    return qx{$lucene_passes_command -remove "$oid"};
}
428# /** remove_document_from_database **/
429
430
4311;
432
433
Note: See TracBrowser for help on using the repository browser.