root/gsdl/trunk/perllib/lucenebuilder.pm @ 15712

Revision 15712, 14.6 KB (checked in by mdewsnip, 11 years ago)

Added "use strict", and fixed up various problems it found.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 ?
29#  *  @version 2.0 Incremental building assistance added, including
30#  *               remove_document_from_database which implements the granddad's
31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
32#  *               so there is one place in the code that works out where the
33#  *               perl script is. John Rowe
34#  *
35#  *  @author John Rowe, DL Consulting Ltd.
36#  */
37###########################################################################
38
39package lucenebuilder;
40
41# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44use strict; no strict 'refs';
45
46
47sub BEGIN {
48    @lucenebuilder::ISA = ('mgppbuilder');
49}
50
51# /**
52#  *  @author  John Thompson, DL Consulting Ltd.
53#  */
54sub new {
55    my $class = shift(@_);
56    my $self = new mgppbuilder (@_);
57    $self = bless $self, $class;
58
59    $self->{'buildtype'} = "lucene";
60
61    # Do we need to put exe on the end?
62    my $exe = &util::get_os_exe ();
63    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
64
65    # So where is lucene_passes.pl anyway?
66    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");
67
68    # So tack perl on the beginning to ensure execution
69    $self->{'full_lucene_passes'} = "$lucene_passes_script";
70    if ($exe eq ".exe")
71    {
72    $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
73    }
74    else
75    {
76    $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
77    }
78
79    return $self;
80}
81# /** new() **/
82
83# lucene has none of these options
84sub generate_index_options {
85    my $self = shift (@_);
86
87    $self->{'casefold'} = 0;
88    $self->{'stem'} = 0;
89    $self->{'accentfold'} = 0;
90    $self->{'stemindexes'} = 0;
91}   
92
93sub default_buildproc {
94    my $self  = shift (@_);
95
96    return "lucenebuildproc";
97}
98
99# this writes a nice version of the text docs
100sub compress_text
101{
102    my $self = shift (@_);
103    # we don't do anything if we don't want compressed text
104    return if $self->{'no_text'};
105
106    my ($textindex) = @_;
107    my $outhandle = $self->{'outhandle'};
108    print STDERR "Saving the document text\n";
109    # the text directory
110    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
111    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
112    &util::mk_all_dir ($text_dir);
113
114    my $osextra = "";
115    if ($ENV{'GSDLOS'} =~ /^windows$/i)
116    {
117    $text_dir =~ s@/@\\@g;
118    }
119    else
120    {
121    if ($outhandle ne "STDERR")
122    {
123        # so lucene_passes doesn't print to stderr if we redirect output
124        $osextra .= " 2>/dev/null";
125    }
126    }
127
128    # get any os specific stuff
129    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
130
131    # Find the perl script to call to run lucene
132    my $full_lucene_passes = $self->{'full_lucene_passes'};
133    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
134
135    my $lucene_passes_sections = "Doc";
136
137    my ($handle);
138
139    if ($self->{'debug'})
140    {
141    $handle = *STDOUT;
142    }
143    else
144    {
145        print STDERR "Full Path:     $full_lucene_passes\n";
146        print STDERR "Executable:    $full_lucene_passes_exe\n";
147        print STDERR "Sections:      $lucene_passes_sections\n";
148        print STDERR "Build Dir:     $build_dir\n";
149        print STDERR "Cmd:           $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra\n";
150    if (!-e "$full_lucene_passes" ||
151        !open($handle, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra"))
152    {
153        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
154        die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
155    }
156    }
157
158    # stored text is always Doc and Sec levels   
159    my $levels = { 'document' => 1, 'section' => 1 };
160    # always do database at section level
161    my $db_level = "section";
162
163    # set up the document processr
164    $self->{'buildproc'}->set_output_handle ($handle);
165    $self->{'buildproc'}->set_mode ('text');
166    $self->{'buildproc'}->set_index ($textindex);
167    $self->{'buildproc'}->set_indexing_text (0);
168    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
169    $self->{'buildproc'}->set_levels ($levels);
170    $self->{'buildproc'}->set_db_level ($db_level);
171    $self->{'buildproc'}->reset();
172    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
173           $self->{'buildproc'}, $self->{'maxdocs'});
174    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
175           "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
176    &plugin::end($self->{'pluginfo'});
177    close ($handle) unless $self->{'debug'};
178    $self->print_stats();
179
180    print STDERR "</Stage>\n" if $self->{'gli'};
181}
182
183sub build_indexes {
184    my $self = shift (@_);
185    my ($indexname) = @_;
186    my $outhandle = $self->{'outhandle'};
187
188    my $indexes = [];
189    if (defined $indexname && $indexname =~ /\w/) {
190    push @$indexes, $indexname;
191    } else {
192    $indexes = $self->{'collect_cfg'}->{'indexes'};
193    }
194
195    # create the mapping between the index descriptions
196    # and their directory names (includes subcolls and langs)
197    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
198
199    # build each of the indexes
200    foreach my $index (@$indexes) {
201    if ($self->want_built($index)) {
202
203        my $idx = $self->{'index_mapping'}->{$index};
204        foreach my $level (keys %{$self->{'levels'}}) {
205        next if $level =~ /paragraph/; # we don't do para indexing
206        my ($pindex) = $level =~ /^(.)/;
207        # should probably check that new name with level
208        # is unique ... but currently (with doc sec and para)
209        # each has unique first letter.
210        $self->{'index_mapping'}->{$index} = $pindex.$idx;
211
212        my $llevel = $mgppbuilder::level_map{$level};
213        print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
214            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
215        print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};
216
217        $self->build_index($index,$llevel);
218        }
219        $self->{'index_mapping'}->{$index} = $idx;
220
221    } else {
222        print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
223    }
224    }
225
226    #define the final field lists
227    $self->make_final_field_list();
228}
229
230
231sub build_index {
232    my $self = shift (@_);
233    my ($index,$llevel) = @_;
234    my $outhandle = $self->{'outhandle'};
235    my $build_dir = $self->{'build_dir'};
236
237    # get the full index directory path and make sure it exists
238    my $indexdir = $self->{'index_mapping'}->{$index};
239    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
240
241    # get any os specific stuff
242    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
243    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
244
245    # Find the perl script to call to run lucene
246    my $full_lucene_passes = $self->{'full_lucene_passes'};
247    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
248
249    # define the section names for lucenepasses
250    # define the section names and possibly the doc name for lucenepasses
251    my $lucene_passes_sections = $llevel;
252
253    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";
254
255    my $osextra = "";
256    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
257    $build_dir =~ s@/@\\@g;
258    } else {
259    if ($outhandle ne "STDERR") {
260        # so lucene_passes doesn't print to stderr if we redirect output
261        $osextra .= " 2>/dev/null";
262    }
263    }
264
265    # get the index expression if this index belongs
266    # to a subcollection
267    my $indexexparr = [];
268    my $langarr = [];
269
270    # there may be subcollection info, and language info.
271    my ($fields, $subcollection, $language) = split (":", $index);
272    my @subcollections = ();
273    @subcollections = split /,/, $subcollection if (defined $subcollection);
274
275    foreach $subcollection (@subcollections) {
276    if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
277        push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
278    }
279    }
280
281    # add expressions for languages if this index belongs to
282    # a language subcollection - only put languages expressions for the
283    # ones we want in the index
284    my @languages = ();
285    my $language_metadata = "Language";
286    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
287    $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
288    }
289    @languages = split /,/, $language if (defined $language);
290    foreach my $language (@languages) {
291    my $not=0;
292    if ($language =~ s/^\!//) {
293        $not = 1;
294    }
295    if($not) {
296        push (@$langarr, "!$language");
297    } else {
298        push (@$langarr, "$language");
299    }
300    }
301
302    # Build index dictionary. Uses verbatim stem method
303    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
304    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
305    my ($handle);
306
307    if ($self->{'debug'}) {
308    $handle = *STDOUT;
309    } else {
310    print STDERR "Cmd: $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra\n";
311    if (!-e "$full_lucene_passes" ||
312        !open($handle, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra")) {
313        print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
314        die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
315    }
316    }
317
318    my $store_levels = $self->{'levels'};
319    my $db_level = "section"; #always
320    my $dom_level = "";
321    foreach my $key (keys %$store_levels) {
322    if ($mgppbuilder::level_map{$key} eq $llevel) {
323        $dom_level = $key;
324    }
325    }
326    if ($dom_level eq "") {
327    print STDERR "Warning: unrecognized tag level $llevel\n";
328    $dom_level = "document";
329    }
330
331    my $local_levels = { $dom_level => 1 }; # work on one level at a time
332
333    # set up the document processr
334    $self->{'buildproc'}->set_output_handle ($handle);
335    $self->{'buildproc'}->set_mode ('text');
336    $self->{'buildproc'}->set_index ($index, $indexexparr);
337    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
338    $self->{'buildproc'}->set_indexing_text (1);
339    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
340    $self->{'buildproc'}->set_levels ($local_levels);
341    $self->{'buildproc'}->set_db_level($db_level);
342    $self->{'buildproc'}->reset();
343    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
344           "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
345    close ($handle) unless $self->{'debug'};
346
347    $self->print_stats();
348
349    $self->{'buildproc'}->set_levels ($store_levels);
350    print STDERR "</Stage>\n" if $self->{'gli'};
351}
352
353# /** A modified version of the basebuilder.pm's function that generates the
354#  *  information database (GDBM) from the GA documents. We need to change this
355#  *  so that if we've been asked to do an incremental build we only add
356#  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
357#  *  module. All other classifiers and metadata will be ignored.
358#  */
359sub make_infodatabase
360{
361    my $self = shift (@_);
362    my $outhandle = $self->{'outhandle'};
363
364    my $dbext = ".bdb";
365    $dbext = ".ldb" if &util::is_little_endian();
366
367    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
368    my $infodb_file = &util::filename_cat($self->{'build_dir'}, "text", $collect_tail . $dbext);
369
370    # If we aren't doing an incremental addition, then we just call the super-
371    # classes version
372    # Note: Incremental addition can only occur if a text/<collection>.ldb
373    #       already exists. If it doesn't, let the super classes function be
374    #       called once to generate it.
375    if (!$self->{'incremental_dlc'} || !(-e $infodb_file))
376    {
377        # basebuilder::make_infodatabase(@_);
378        # Note: this doesn't work as the direct reference means all the $self
379        #       data is lost.
380        $self->basebuilder::make_infodatabase(@_);
381        return;
382    }
383
384    # Carry on with an incremental addition
385    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
386    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};
387
388    # 1. Init all the classifiers
389    &classify::init_classifiers ($self->{'classifiers'});
390    # 2. Init the buildproc settings.
391    #    Note: we still need this to process any associated files - but we
392    #    don't expect to pipe anything to txt2db so we can do away with the
393    #    complex output handle.
394    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
395    &util::mk_all_dir ($assocdir);
396    $self->{'buildproc'}->set_mode ('incinfodb'); # Very Important
397    $self->{'buildproc'}->set_assocdir ($assocdir);
398    # 3. Read in all the metadata from the files in the archives directory using
399    #    the GAPlug and using ourselves as the document processor!
400    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
401
402    print STDERR "</Stage>\n" if $self->{'gli'};
403}
404
405# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
406#  *  -remove and the document id on the command line.
407#  *
408#  *  @param oid is the document identifier to be removed.
409#  *
410#  *  @author John Rowe, DL Consulting Ltd.
411#  */
412sub remove_document_from_database
413{
414    my ($self, $oid) = @_;
415    # Find the perl script to call to run lucene
416    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};
417    # Call lucene_passes.pl with -remove and the document ID on the command line
418    `$full_lucene_passes_exe -remove "$oid"`;
419}
420# /** remove_document_from_database **/
421
422
4231;
424
425
Note: See TracBrowser for help on using the browser.