root/main/trunk/greenstone2/perllib/lucenebuilder.pm @ 24496

Revision 24496, 16.4 KB (checked in by davidb, 9 years ago)

Tidy up of comments

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 Initial implementation of incremental building
#  *  @version 2.0 Incremental building assistance added, including
#  *               remove_document_from_database, which implements the
#  *               grandparent class's empty function by calling
#  *               lucene_passes.pl through full_lucene_passes_exe, so there
#  *               is one place in the code that works out where the
#  *               perl script is. John Rowe
34#  *
35#  *  @author David Bainbridge and Katherine Don, Waikato DL Research group
36#  *  @author John Rowe, DL Consulting Ltd.
37#  *  @author John Thompson, DL Consulting Ltd.
38#  */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49
# Declare the parent class at compile time: lucenebuilder inherits
# from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
53
# /**
#  *  Constructor. Creates an mgppbuilder object from the supplied
#  *  arguments, re-blesses it into the requested class, marks the build
#  *  type as "lucene" and records how lucene_passes.pl is to be run:
#  *
#  *    full_lucene_passes      the path of the lucene_passes.pl script
#  *    full_lucene_passes_exe  the command line prefix that executes it
#  *
#  *  Dies if LuceneWrapper.jar is not found under $GSDLHOME/bin/java.
#  *
#  *  @param  the class name, followed by the arguments handed on
#  *          unchanged to the mgppbuilder constructor
#  *  @return the new builder object
#  *
#  *  @author  John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    my $self = mgppbuilder->new(@_);
    bless($self, $class);

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation there is no
    # LuceneWrapper.jar, and without it we cannot continue.
    my $wrapper_jar = &util::filename_cat($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper.jar");
    die "***** ERROR: $wrapper_jar does not exist\n" unless (-f $wrapper_jar);

    # Executable suffix for this OS (".exe" is the case tested below)
    my $exe_suffix = &util::get_os_exe ();

    # So where is lucene_passes.pl anyway?
    my $passes_script = &util::filename_cat("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Tack perl on the beginning to ensure execution: a bare "perl.exe"
    # when the suffix is ".exe", otherwise the quoted perl executable
    # reported by util with -S.
    my $perl_prefix = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix"
        : "\"" . &util::get_perl_exec() . "\" -S";
    $self->{'full_lucene_passes_exe'} = "$perl_prefix \"$passes_script\"";

    return $self;
}
# /** new() **/
92
# Reports whether this builder supports incremental building.
# Always returns 1: lucene can do incremental building.
sub is_incremental_capable {
    return 1;
}
99
# /**
#  *  Prepares the build processor for an incremental build by copying
#  *  what has already been indexed out of the existing build.cfg:
#  *
#  *    indexfields    each listed field is flagged (value 1) in the
#  *                   buildproc's 'indexfields' hash
#  *    indexfieldmap  each "name->shortname" entry is stored as
#  *                   name => shortname in the buildproc's
#  *                   'indexfieldmap' hash
#  *
#  *  Does nothing when read_build_cfg() returns undef. Entries of
#  *  indexfieldmap that are undefined or not of the form
#  *  "name->shortname" are skipped rather than being recorded under an
#  *  empty key.
#  */
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            next unless defined $mapping;

            # The first group is greedy, so the split happens at the last "->"
            my ($name, $shortname) = $mapping =~ /^(.*)\-\>(.*)$/;

            # A malformed entry yields no captures; storing it would create
            # a bogus '' => undef pair in the map.
            next unless defined $name;

            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $shortname;
        }
    }
}
122
# lucene has none of these options: once the parent class implementation
# has run, casefold, stem, accentfold and stemindexes are all set to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported_option ('casefold', 'stem', 'accentfold', 'stemindexes') {
        $self->{$unsupported_option} = 0;
    }

    # same value the final assignment yields when it is the last statement
    return 0;
}
134
# Returns the name of the default build processor: "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    return 'lucenebuildproc';
}
140
# /**
#  *  This writes a nice version of the text docs. Unless the 'debug'
#  *  flag is set, a pipe is opened to
#  *
#  *      lucene_passes.pl text Doc "<build_dir>" "dummy"
#  *
#  *  and handed to the build processor as its output handle; with
#  *  'debug' set the build processor writes to STDOUT instead. The
#  *  plugins are then run over the source directory in 'text' mode.
#  *
#  *  Nothing is done when the 'no_text' flag is set.
#  *
#  *  Dies (after emitting a FatalError tag when running under GLI) if
#  *  lucene_passes.pl does not exist or the pipe cannot be opened.
#  *
#  *  @param textindex the index description passed to the build
#  *                   processor's set_index
#  */
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used again after this point, while
        # build_index converts $build_dir here - confirm which was intended.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR")
    {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # Built once, so that the command that is logged is exactly the one run
    my $lucene_passes_cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $lucene_passes_cmd\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "|-", $lucene_passes_cmd))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # name this function (not build_index) so the failure can be traced
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();

    # run the plugins over the source directory with the build processor
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # STDOUT (debug mode) is left open; only the pipe is closed
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
224
# /**
#  *  Builds the requested indexes, one lucene index per (index, level)
#  *  pair.
#  *
#  *  Paragraph level is not supported: a warning is printed if it is
#  *  configured and that level is skipped. While a level is being built
#  *  the index's entry in 'index_mapping' is temporarily prefixed with
#  *  the first letter of the level name; the plain entry is put back
#  *  once all levels of that index are done.
#  *
#  *  @param indexname optional; when it contains a word character only
#  *                   this index is built, otherwise every index listed
#  *                   in the collection configuration is considered
#  */
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    # an explicitly named index, or else everything from collect.cfg
    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # Have we got para index? Say so once, however many levels match.
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene/Solr\n";
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $plain_dirname = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $level_initial.$plain_dirname;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # put the un-prefixed directory name back
        $self->{'index_mapping'}->{$index} = $plain_dirname;
    }

    $self->post_build_indexes();
}
281
282
# /**
#  *  Builds one lucene index at one level. Unless the 'debug' flag is
#  *  set, a pipe is opened to
#  *
#  *      lucene_passes.pl [-removeold] index <level> "<build_dir>" "<indexdir>"
#  *
#  *  (-removeold is left out for incremental builds) and handed to the
#  *  build processor as its output handle; with 'debug' set the build
#  *  processor writes to STDOUT instead. The plugins are then read with
#  *  the build processor restricted to the one level being built.
#  *
#  *  Dies (after emitting a FatalError tag when running under GLI) if
#  *  lucene_passes.pl does not exist or the pipe cannot be opened.
#  *
#  *  @param index  the index description, "fields[:subcollections[:languages]]"
#  *                where subcollections and languages are comma separated
#  *  @param llevel the level tag (a value of %mgppbuilder::level_map)
#  */
sub build_index {
    my ($self, $index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # an incremental build keeps the existing index; anything else starts afresh
    my $opt_create_index = $self->{'incremental'} ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my (undef, $subcollection_spec, $language_spec) = split (":", $index);

    # get the index expressions if this index belongs to subcollections;
    # names without a definition in collect.cfg are passed over
    my $indexexparr = [];
    if (defined $subcollection_spec) {
        foreach my $subcollection_name (split /,/, $subcollection_spec) {
            if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection_name})) {
                push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection_name});
            }
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - each entry is handed on as written,
    # including any leading "!" marking a language to exclude
    my $languagemetadata = defined ($self->{'collect_cfg'}->{'languagemetadata'})
        ? $self->{'collect_cfg'}->{'languagemetadata'}
        : "Language";
    my $langarr = [];
    @$langarr = split /,/, $language_spec if (defined $language_spec);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my $handle;
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # the section name for lucene_passes is the level tag itself
        my $lucene_passes_cmd = "$full_lucene_passes_exe $opt_create_index index $llevel \"$build_dir\" \"$indexdir\"   $osextra";
        print STDERR "Cmd: $lucene_passes_cmd\n";
        unless (-e "$full_lucene_passes" && open($handle, "|-", $lucene_passes_cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # work out which configured level name corresponds to the level tag;
    # if several do, the last one visited is kept
    my $store_levels = $self->{'levels'};
    my @matching_levels = grep { $mgppbuilder::level_map{$_} eq $llevel } keys %$store_levels;
    my $dom_level = @matching_levels ? $matching_levels[-1] : "";
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language_spec);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ({ $dom_level => 1 }); # work on one level at a time
    $buildproc->set_db_level ("section"); # always
    $buildproc->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # STDOUT (debug mode) is left open; only the pipe is closed
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # hand the full set of levels back to the build processor
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
404
# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database from the GA documents. When an incremental build
#  *  has been requested and an information database already exists, the
#  *  classifiers are initialised and the plugins are read with the build
#  *  processor in 'incinfodb' mode (the IncrementalBuildUtils approach, in
#  *  which only autohierarchy classifiers receive the new metadata).
#  *  In every other case basebuilder's make_infodatabase is called with
#  *  the same arguments.
#  *
#  *  This was added to utilize DLC's incremental updating of Hierarchy
#  *  classifiers. They are heading towards just using dynamic classifiers,
#  *  and we do not want to use this code either. So now, we just use
#  *  basebuilder's version of make_infodatabase.
#  */
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Get info database file path (it lives in the build's text directory)
    my $infodb_file_path = &dbutil::get_infodb_file_path(
        $self->{'infodbtype'},
        $self->{'collection'},
        &util::filename_cat($self->{'build_dir'}, "text"));

    # Incremental addition can only occur if an information database
    # already exists. If it doesn't, or no incremental build was asked
    # for, let the super class's function generate it.
    # Note: the call has to go through $self - a direct
    #       basebuilder::make_infodatabase(@_) would lose all the $self data.
    unless ($self->{'incremental'} && -e $infodb_file_path)
    {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away
    #    with the complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using the build processor as the document
    #    processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
455
# /** Lucene specific document removal function. This works by running the
#  *  stored lucene_passes.pl command line (full_lucene_passes_exe) through
#  *  the shell with -remove and the quoted document id appended.
#  *
#  *  @param oid is the document identifier to be removed. It is placed
#  *         inside double quotes on a shell command line as is.
#  *  @return whatever the command wrote to its standard output
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift(@_);
    my $oid = shift(@_);

    # The perl script to call to run lucene
    my $lucene_passes_command = $self->{'full_lucene_passes_exe'};

    # Call lucene_passes.pl with -remove and the document ID on the command line
    return qx{$lucene_passes_command -remove "$oid"};
}
# /** remove_document_from_database **/
472
473
4741;
475
476
Note: See TracBrowser for help on using the browser.