source: gsdl/trunk/perllib/lucenebuilder.pm@ 16956

Last change on this file since 16956 was 16379, checked in by kjdon, 16 years ago

global block pass: added in extra argument to plugin::read calls

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
RevLine 
[8072]1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
[12844]5# from the New Zealand Digital Library Project at the
[8072]6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[12844]26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added. This includes
30# * remove_document_from_database, which overrides the grandparent class's
31# * empty function and calls lucene_passes.pl, and full_lucene_passes_exe,
32# * which gives the code a single place that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
[8072]39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
[12844]43use mgppbuilder;
[15712]44use strict; no strict 'refs';
[8072]45
[12844]46
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
# /**
#  * Constructor. Creates an mgppbuilder from the remaining arguments,
#  * reblesses it into the requested class, marks the build type as
#  * "lucene", and records both the path of lucene_passes.pl and the
#  * command line used to run it.
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my ($class, @args) = @_;

    my $self = bless(mgppbuilder->new(@args), $class);
    $self->{'buildtype'} = "lucene";

    # The executable suffix decides which perl invocation is used below
    my $exe_suffix = &util::get_os_exe();

    # lucene_passes.pl lives in the Greenstone script directory
    my $passes_script = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Tack perl on the beginning to ensure execution
    $self->{'full_lucene_passes_exe'} = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix \"$passes_script\""
        : "perl -S \"$passes_script\"";

    return $self;
}
# /** new() **/
[8072]82
# lucene has none of these options, so every one of them is switched off
sub generate_index_options {
    my ($self) = @_;

    foreach my $option ('casefold', 'stem', 'accentfold') {
        $self->{$option} = 0;
    }
    # Left as the final statement so the sub still evaluates to this
    # assignment, exactly as before.
    $self->{'stemindexes'} = 0;
}
92
# Returns the name of the default build processor, "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;
    return "lucenebuildproc";
}
98
# Writes a nice version of the text docs by feeding every document through
# lucene_passes.pl in "text" mode at the Doc level.
#
# Nothing happens when $self->{'no_text'} is set. When $self->{'debug'} is
# set the buildproc output goes to STDOUT instead of being piped to
# lucene_passes.pl.
#
# @param textindex  index specification handed to the buildproc's set_index
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be
# opened. A failure reported when the pipe is closed only produces a
# warning on STDERR.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # make sure the text directory exists
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);
    # NOTE(review): the previous code converted $text_dir to backslashes on
    # Windows at this point, but $text_dir is never read again, so that
    # conversion had no effect and has been dropped. $build_dir (the path
    # actually passed on the command line) is left exactly as
    # util::filename_cat returned it - confirm that is already in native form.

    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # Built once so the command that is echoed is the command that is run
    my $lucene_passes_cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $lucene_passes_cmd\n";
        # The command is still a single string (it may carry a shell
        # redirection), but the explicit pipe mode keeps it from being
        # parsed for an open mode.
        if (!-e "$full_lucene_passes" ||
            !open($handle, "|-", $lucene_passes_cmd))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'})
    {
        # Closing a pipe waits for the child process, so a false result
        # means lucene_passes.pl exited non-zero or the pipe could not be
        # flushed. Report it rather than ignoring it silently.
        close ($handle)
            or print STDERR "Warning: lucene_passes.pl (text pass) did not finish cleanly (status $?)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
[9179]182
# Builds every wanted index at each configured level (paragraph levels are
# skipped), then asks the builder to produce the final field list.
#
# @param indexname  optional; when it contains a word character only that
#                   index is built, otherwise the indexes listed in the
#                   collection configuration are used.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        # Remember the plain directory name; it is temporarily replaced by
        # a level-prefixed one while each level is built.
        my $plain_dir = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # The first letter of the level name prefixes the directory.
            # Should probably check that the new name is unique, but
            # currently (with doc, sec and para) each level has a unique
            # first letter.
            my ($level_prefix) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $level_prefix.$plain_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # put the plain directory name back
        $self->{'index_mapping'}->{$index} = $plain_dir;
    }

    # define the final field lists
    $self->make_final_field_list();
}
229
[12844]230
# Builds one index at one level by feeding the documents through
# lucene_passes.pl in "index" mode.
#
# @param index   index specification of the form
#                "fields[:subcollections[:languages]]", where subcollections
#                and languages are comma separated lists
# @param llevel  level tag handed to lucene_passes.pl; it is matched
#                against the values of %mgppbuilder::level_map
#
# When $self->{'debug'} is set the buildproc output goes to STDOUT instead
# of being piped to lucene_passes.pl.
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be
# opened. A failure reported when the pipe is closed only produces a
# warning on STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info; the leading
    # fields part is not needed here
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # names without a configured expression are skipped
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll_name (split /,/, $subcollection) {
            if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name})) {
                push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name});
            }
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # Entries are passed through unchanged, including any leading "!". The
    # previous code stripped the "!" only to add it straight back.
    my $langarr = (defined $language) ? [ split /,/, $language ] : [];

    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # Built once so the command that is echoed is the command that is run
    my $lucene_passes_cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $lucene_passes_cmd\n";
        # The command is still a single string (it may carry a shell
        # redirection), but the explicit pipe mode keeps it from being
        # parsed for an open mode.
        if (!-e "$full_lucene_passes" ||
            !open($handle, "|-", $lucene_passes_cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # Work out which configured level corresponds to the requested tag
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child process, so a false result
        # means lucene_passes.pl exited non-zero or the pipe could not be
        # flushed. Report it rather than ignoring it silently.
        close ($handle)
            or print STDERR "Warning: lucene_passes.pl (index pass) did not finish cleanly (status $?)\n";
    }

    $self->print_stats();

    # hand the full set of levels back to the buildproc
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
[8072]352
# /** Lucene's variant of basebuilder.pm's make_infodatabase.
#  *
#  *  Unless an incremental build was requested AND an information database
#  *  file already exists, the work is handed to
#  *  basebuilder::make_infodatabase with the same arguments. Otherwise an
#  *  incremental addition is performed: the classifiers are initialised,
#  *  the buildproc is switched to 'incinfodb' mode with an assoc directory,
#  *  and the source directory is read again through the plugins.
#  *
#  *  Per the original author's note, the incremental path is meant to add
#  *  metadata only to autohierarchy classifiers (via the
#  *  IncrementalBuildUtils module); that handling is not visible in this
#  *  function.
#  */
sub make_infodatabase
{
    my ($self, @args) = @_;
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives
    my $text_directory_path = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # Incremental addition can only occur if an information database
    # already exists. If it doesn't, or no incremental build was asked for,
    # let the superclass's function generate it.
    unless ($self->{'incremental'} && -e $infodb_file_path)
    {
        # Called as a method so that $self is passed along; a plain
        # basebuilder::make_infodatabase(...) call would lose it.
        $self->basebuilder::make_infodatabase(@args);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings. These are still needed to process
    #    any associated files, but no output handle is set up here.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
    $self->{'buildproc'}->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the source directory,
    #    with the buildproc as the document processor
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
402
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *
#  *  The command goes through the shell, so on non-Windows systems the
#  *  characters that remain special inside double quotes are escaped in
#  *  the document id first. A non-zero exit status is reported on STDERR.
#  *
#  *  @param  oid is the document identifier to be removed.
#  *  @return whatever the command wrote to standard output: a list of
#  *          lines in list context, one string in scalar context.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my ($self, $oid) = @_;
    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # Within a POSIX shell's double quotes, backslash, double quote, dollar
    # and backtick are still interpreted, so neutralise them.
    # NOTE(review): Windows command-line quoting follows different rules and
    # is left untouched here - confirm whether it needs its own handling.
    my $shell_oid = defined $oid ? $oid : "";
    my $on_windows = (defined $ENV{'GSDLOS'} && $ENV{'GSDLOS'} =~ /^windows$/i);
    if (!$on_windows)
    {
        $shell_oid =~ s/([\\"\$`])/\\$1/g;
    }

    # Call lucene_passes.pl with -remove and the document ID on the command line
    my @output = `$full_lucene_passes_exe -remove "$shell_oid"`;
    if ($? != 0)
    {
        print STDERR "Warning: lucene_passes.pl -remove failed for document $shell_oid (status $?)\n";
    }

    return wantarray ? @output : join ('', @output);
}
# /** remove_document_from_database **/
419
420
[8072]4211;
422
423
Note: See TracBrowser for help on using the repository browser.