source: trunk/gsdl/perllib/lucenebuilder.pm@ 13590

Last change on this file since 13590 was 13590, checked in by kjdon, 17 years ago

mgpp and lucene. made them always use doc and sec levels for the text regardless of index level specification. mgpp will always index at doc and sec level, but these options may not be presented to the user. this is to ensure that if we have sectioned documents, we don't need to turn on section indexing in order for the document display to use sections

  • Property svn:keywords set to Author Date Id Revision
File size: 14.6 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added, including
30# * remove_document_from_database which implements the granddad's
31# * empty function to call the lucene_passes.pl and full_lucene_passes_exe
32# * so there is one place in the code that works out where the
33# * perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44
45# use IncrementalBuildUtils;
46
# Establish the inheritance chain at compile time: lucenebuilder is a
# subclass of mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
# /** Constructor. Builds an mgppbuilder with the supplied arguments,
#  *  re-blesses it into the requested class, and records where the
#  *  lucene_passes.pl script lives plus the command line used to run it.
#  *
#  *  Fields set on the object:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path to lucene_passes.pl
#  *    full_lucene_passes_exe - shell command that runs that script with perl
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my ($class, @ctor_args) = @_;

    # Construct the parent object, then turn it into one of ours.
    my $self = bless(mgppbuilder->new(@ctor_args), $class);
    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();

    # So where is lucene_passes.pl anyway?
    my $script_path = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script",
                                          "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$script_path";

    # Tack perl on the beginning to ensure execution: "perl.exe" when the
    # OS executable suffix is ".exe", otherwise "perl -S".
    my $interpreter = ($exe eq ".exe") ? "perl$exe" : "perl -S";
    $self->{'full_lucene_passes_exe'} = "$interpreter \"$script_path\"";

    return $self;
}
81# /** new() **/
82
# Lucene has none of the casefold / stem / accentfold options, so every
# one of them (and the combined 'stemindexes' value) is forced to 0 on
# the builder object.  Returns 0.
sub generate_index_options {
    my ($self) = @_;

    foreach my $option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$option} = 0;
    }

    return 0;
}
92
# Returns the name of the document processor package used by this
# builder: always "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;
    return "lucenebuildproc";
}
98
# Writes a nice version of the text docs: every document is run through
# the buildproc in 'text' mode and the output is piped into
# "lucene_passes.pl text Doc <build_dir> dummy".
#
# Arguments:
#   $textindex - index specification passed to the buildproc's set_index()
#
# Behaviour:
#   * returns immediately when $self->{'no_text'} is set;
#   * in debug mode the buildproc output goes to STDOUT and no external
#     process is started;
#   * dies (after emitting a GLI <FatalError> element when $self->{'gli'}
#     is set) if lucene_passes.pl does not exist or the pipe cannot be
#     opened.
sub compress_text
{
    my $self = shift(@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'}, "");
    &util::mk_all_dir($text_dir);

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my $handle;
    if ($self->{'debug'})
    {
        # Handles are passed around by name, as in the rest of this builder.
        $handle = "STDOUT";
    }
    else
    {
        my $command = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra";

        print STDERR "Full Path: $full_lucene_passes\n";
        print STDERR "Executable: $full_lucene_passes_exe\n";
        print STDERR "Sections: $lucene_passes_sections\n";
        print STDERR "Build Dir: $build_dir\n";
        print STDERR "Cmd: $command\n";

        # The command needs the shell (for the optional 2>/dev/null
        # redirect), hence the string form of the pipe open.
        if (!-e "$full_lucene_passes" || !open(PIPEOUT, "| $command"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
        # Fully qualified so the name resolves from other packages too.
        $handle = "lucenebuilder::PIPEOUT";
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do gdbm at section level
    my $gdbm_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($textindex);
    $buildproc->set_indexing_text(0);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Close the pipe exactly once; STDOUT (debug mode) is left open.
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
184
# Builds the requested indexes at every configured level.
#
# Arguments:
#   $indexname - optional; when it contains a word character only that
#                index is built, otherwise every index listed in
#                $self->{'collect_cfg'}->{'indexes'} is considered.
#
# For each index that want_built() accepts, build_index() is called once
# per key of $self->{'levels'} (paragraph levels are skipped).  While a
# level is being built the index's directory mapping is temporarily
# prefixed with the first letter of the level name; the unprefixed mapping
# is restored afterwards.  Finishes by calling make_final_field_list().
sub build_indexes {
    my $self = shift(@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping($indexes);

    # build each of the indexes
    # (lexical loop variable: previously an undeclared package global)
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            foreach my $level (keys %{$self->{'levels'}}) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                # NOTE(review): the level attribute is emitted unquoted; left
                # as-is because the consumer of this output is not visible here.
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index, $llevel);
            }
            # put the plain (level-less) directory name back
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    #define the final field lists
    $self->make_final_field_list();
}
231
232
# Builds one index at one level by streaming the documents through the
# buildproc into "lucene_passes.pl [-create] index <level> <build_dir>
# <indexdir>".
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma separated
#   $llevel - level tag handed to lucene_passes.pl; it is matched against
#             the values of %mgppbuilder::level_map to find the buildproc
#             level name
#
# Behaviour:
#   * "-create" is passed unless $self->{'keepold'} is set;
#   * in debug mode the buildproc output goes to STDOUT and no external
#     process is started;
#   * dies (after emitting a GLI <FatalError> element when $self->{'gli'}
#     is set) if lucene_passes.pl does not exist or the pipe cannot be
#     opened;
#   * the buildproc's levels are reset to $self->{'levels'} at the end.
sub build_index {
    my $self = shift(@_);
    my ($index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir(&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split(":", $index);
    my @subcollections = ();
    @subcollections = split(/,/, $subcollection) if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcol};
        push(@$indexexparr, $expression) if (defined $expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my @languages = ();
    @languages = split(/,/, $language) if (defined $language);
    foreach my $lang (@languages) {
        # a leading "!" marks a negated language and is kept in the entry
        my $negated = ($lang =~ s/^\!//) ? 1 : 0;
        push(@$langarr, $negated ? "!$lang" : "$lang");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        # Handles are passed around by name, as in the rest of this builder.
        $handle = "STDOUT";
    } else {
        my $command = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra";
        print STDERR "Cmd: $command\n";

        # The command needs the shell (for the optional 2>/dev/null
        # redirect), hence the string form of the pipe open.
        if (!-e "$full_lucene_passes" || !open(PIPEOUT, "| $command")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the command that was actually attempted (the message
            # previously interpolated an undeclared, always-empty variable).
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
        # Fully qualified so the name resolves from other packages too.
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "section"; #always

    # Map the lucene level tag back to the buildproc's level name.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle($handle);
    $buildproc->set_mode('text');
    $buildproc->set_index($index, $indexexparr);
    $buildproc->set_index_languages($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text(1);
    $buildproc->set_indexfieldmap($self->{'indexfieldmap'});
    $buildproc->set_levels($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'},
                  "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    close($handle) unless $self->{'debug'};

    $self->print_stats();

    # restore the full set of levels for whoever uses the buildproc next
    $buildproc->set_levels($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
355
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database (GDBM) from the GA documents.
#  *
#  *  When an incremental build has been requested ($self->{'incremental_dlc'})
#  *  and the info database file already exists under <build_dir>/text, the
#  *  classifiers are initialised and the documents are re-read with the
#  *  buildproc in 'incinfodb' mode.  In every other case the work is
#  *  delegated to basebuilder::make_infodatabase with the remaining
#  *  arguments.
#  */
sub make_infodatabase
{
    my $self = shift(@_);
    my $outhandle = $self->{'outhandle'};

    # The database extension depends on the byte order of this machine.
    my $suffix = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text",
                                          $self->{'collection'} . $suffix);

    # Incremental addition can only occur if the info database already
    # exists.  If it doesn't (or no incremental build was asked for), let
    # the super class's function generate it.
    # Note: this is invoked as a method ($self->basebuilder::...) because a
    # direct call to basebuilder::make_infodatabase(@_) would lose all the
    # $self data.
    unless ($self->{'incremental_dlc'} && -e $infodb_file)
    {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to txt2db so we can do away with the
    #    complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode('incinfodb'); # Very Important
    $buildproc->set_assocdir($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read($self->{'pluginfo'}, $self->{'source_dir'}, "", {},
                  $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
405
# /** Lucene specific document removal function. This works by calling
#  *  lucene_passes.pl with -remove and the document id on the command line.
#  *  The output captured from that command is the return value.
#  *
#  * @param oid is the document identifier to be removed.
#  *
#  * @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift(@_);
    my ($oid) = @_;

    # The command line that runs lucene_passes.pl was worked out in new().
    my $runner = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is interpolated into a shell command inside double
    # quotes - presumably OIDs never contain shell metacharacters; confirm.
    my $command = "$runner -remove \"$oid\"";
    return `$command`;
}
421# /** remove_document_from_database **/
422
423
4241;
425
426
Note: See TracBrowser for help on using the repository browser.