root/main/trunk/greenstone2/perllib/lucenebuilder.pm @ 24460

Revision 24460, 16.3 KB (checked in by davidb, 8 years ago)

Code changes to support indexers that are provided through the extension mechanism

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 Automated incremental building
29#  *  @version 2.0 Incremental building assistance added, including
30#  *               remove_document_from_database which implements the granddad's
31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
32#  *               so there is one place in the code that works out where the
33#  *               perl script is. John Rowe
34#  *
35#  *  @author Waikato Digital Library Research Group
36#  *  @author John Rowe, DL Consulting Ltd.
37#  */
38###########################################################################
39
40package lucenebuilder;
41
42# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
43
44use mgppbuilder;
45use strict;
46no strict 'refs';
47use util;
48
# Set up inheritance at compile time: lucenebuilder is a subclass of
# mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
52
53# /**
54#  *  @author  John Thompson, DL Consulting Ltd.
55#  */
# Construct a lucenebuilder: an mgppbuilder object re-blessed into this
# class, with the build type set to "lucene" and the command line used to
# run lucene_passes.pl worked out once and cached on the object.
#
# Arguments: everything after the class name is passed unchanged to the
#            mgppbuilder constructor.
# Returns:   the new object.
# Dies:      if LuceneWrapper.jar is not found under $GSDLHOME/bin/java.
sub new {
    my $class = shift(@_);

    # Explicit class-method call. The indirect-object form
    # "new mgppbuilder (@_)" relies on parser heuristics and is fragile in
    # a package that defines its own new().
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation, then we won't
    # be able to continue. Check for existence of LuceneWrapper to see if
    # Lucene was disabled.
    my $lucene = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrapper.jar");
    if (!-f $lucene) {
        die "***** ERROR: $lucene does not exist\n";
    }

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # 'full_lucene_passes' is the bare script path (used for existence
    # checks); 'full_lucene_passes_exe' is the shell command that runs it
    # through perl.
    $self->{'full_lucene_passes'} = $lucene_passes_script;
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$lucene_passes_script\"";
    }

    return $self;
}
90# /** new() **/
91
# Capability query: lucene can do incremental building, so this always
# returns 1.
sub is_incremental_capable {
    my $can_build_incrementally = 1;
    return $can_build_incrementally;
}
98
# Seed the build processor with what an earlier build already indexed.
#
# Reads 'indexfields' and 'indexfieldmap' from the existing build.cfg (via
# $self->read_build_cfg()) and copies them into $self->{'buildproc'}:
#   - each 'indexfields' entry becomes a key of buildproc->{'indexfields'}
#     with value 1;
#   - each 'indexfieldmap' entry of the form "name->code" becomes
#     buildproc->{'indexfieldmap'}->{name} = code.
#
# Does nothing if read_build_cfg() returns undef. Returns nothing.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing
    # build.cfg so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    my $buildproc = $self->{'buildproc'};

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $buildproc->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # An entry without "->" does not match and leaves both captures
            # undefined; skip it rather than recording a bogus mapping
            # under the empty-string key.
            next unless defined $f;

            $buildproc->{'indexfieldmap'}->{$f} = $v;
        }
    }

    return;
}
121
# Lucene has none of the casefold / stem / accentfold / stemindexes
# options: let the parent class generate its index options first, then
# force all four to 0 on this object.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $unsupported_option (qw(casefold stem accentfold stemindexes)) {
        $self->{$unsupported_option} = 0;
    }

    return 0;
}
133
# Name of the build processor used by this builder: always the string
# "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
139
# Write a nice version of the text docs.
#
# Pipes the output of the build processor (run in 'text' mode over the
# source directory) into "lucene_passes.pl text Doc <build_dir> dummy".
# In debug mode the output goes to STDOUT and no child process is started.
#
# Arguments: $textindex - passed to the build processor's set_index().
# Returns:   nothing useful; returns immediately if $self->{'no_text'} is
#            set.
# Dies:      if lucene_passes.pl does not exist or the pipe cannot be
#            opened (after emitting a <FatalError> element when running
#            under GLI).
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): $text_dir is not read again after this point, while
        # $build_dir (which is what goes on the command line) is left
        # unconverted. build_index() converts $build_dir instead - confirm
        # which was intended here.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # Build the command once so that what is logged is exactly what
        # gets run.
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra";

        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";

        # Three-argument pipe open: the command is still a single string
        # run via the shell (the stderr redirect needs that), but the mode
        # can no longer be affected by the command text.
        if (!-e "$full_lucene_passes" || !open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_db_level ($db_level);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # In debug mode the handle is STDOUT, which must stay open.
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
223
# Build every wanted index at every non-paragraph level.
#
# Arguments: $indexname - optional; when it contains a word character only
#            that index is built, otherwise the list comes from
#            collect_cfg->{'indexes'}.
#
# For each index/level pair the index's entry in $self->{'index_mapping'}
# is temporarily replaced by the first letter of the level name followed
# by the mapped directory name, build_index() is called, and the original
# mapping is restored once all levels are done.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # Have we got para index? Warn once, however many levels match.
    if (grep { /paragraph/ } keys %{$self->{'levels'}}) {
        print {$outhandle} "Warning: Paragraph level indexing not supported by Lucene/Solr\n";
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    INDEX:
    foreach my $index (@$indexes) {

        unless ($self->want_built($index)) {
            print {$outhandle} "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next INDEX;
        }

        my $base_subdir = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            # we don't do para indexing
            next if $level =~ /paragraph/;

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $level_initial.$base_subdir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print {$outhandle} "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # put the un-prefixed directory name back
        $self->{'index_mapping'}->{$index} = $base_subdir;
    }

    $self->post_build_indexes();
}
280
281
# Build one index at one level.
#
# Pipes the output of the build processor (run in 'text' mode with text
# indexing switched on) into
#   lucene_passes.pl [-removeold] index <level> <build_dir> <index_dir>
# "-removeold" is passed unless $self->{'incremental'} is set. In debug
# mode the output goes to STDOUT and no child process is started.
#
# Arguments: $index  - index description "fields[:subcolls[:languages]]",
#                      where subcolls and languages are comma separated.
#            $llevel - level tag handed to lucene_passes.pl; also looked
#                      up (as a value) in %mgppbuilder::level_map.
# Dies:      if lucene_passes.pl does not exist or the pipe cannot be
#            opened (after emitting a <FatalError> element when running
#            under GLI).
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs to a subcollection;
    # names with no definition in collect.cfg are ignored
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcoll_name (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll_name};
            push (@$indexexparr, $expression) if defined $expression;
        }
    }

    # add expressions for languages if this index belongs to a language
    # subcollection. Entries are passed through verbatim, including any
    # leading "!" (negation), so no per-entry rewriting is needed.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = (defined $language) ? [ split /,/, $language ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # Build the command once so that what is logged is exactly what
        # gets run.
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";
        print STDERR "Cmd: $cmd\n";

        # Three-argument pipe open: still a single shell command string,
        # but the mode can no longer be affected by the command text.
        if (!-e "$full_lucene_passes" || !open($handle, '|-', $cmd)) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # Translate the level tag back to the level name used as a key of
    # $self->{'levels'}; fall back to "document" when nothing matches.
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    #$buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # In debug mode the handle is STDOUT, which must stay open.
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # give the build processor back the full set of levels
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
403
404# /** A modified version of the basebuilder.pm's function that generates the
405#  *  information database from the GA documents. We need to change this
406#  *  so that if we've been asked to do an incremental build we only add
407#  *  metadata to autohierarchy classifiers via the IncrementalBuildUtils
408#  *  module. All other classifiers and metadata will be ignored.
409#  */
410# This was added to utilize DLC's incremental updating of Hierarchy classifiers. They are heading towards just using dynamic classifiers, and we do not want to use this code either. So now, we just use basebuilder's version of make_infodatabase
# Incremental-aware variant of make_infodatabase.
#
# When the build is not incremental, or no info database file exists yet,
# this simply delegates to basebuilder::make_infodatabase. Otherwise it
# initialises the classifiers and runs the build processor in 'incinfodb'
# mode over the source directory.
sub make_infodatabase_dlc
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives (under <build_dir>/text)
    my $text_directory_path = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # An incremental addition is only possible when an information database
    # already exists. In every other case let the superclass's function
    # generate it.
    unless ($self->{'incremental'} && -e $infodb_file_path) {
        # Called as a method so that $self is passed along; a plain
        # basebuilder::make_infodatabase(@_) would lose it.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # From here on we are doing an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # Step 1: initialise all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # Step 2: configure the buildproc. It is still needed to process any
    # associated files, but nothing is piped to the database so no output
    # handle is set up.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # Step 3: read in all the metadata from the files in the archives
    # directory, with the buildproc acting as the document processor
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
454
455# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
456#  *  -remove and the document id on the command line.
457#  *
458#  *  @param oid is the document identifier to be removed.
459#  *
460#  *  @author John Rowe, DL Consulting Ltd.
461#  */
# Remove one document by running the cached lucene_passes.pl command with
# "-remove" and the document identifier, and return whatever the command
# printed on standard output.
sub remove_document_from_database
{
    my $self = shift;
    my $oid = shift;

    # The command to run lucene_passes.pl was worked out once in new().
    # NOTE(review): $oid is interpolated into a shell command line inside
    # double quotes - confirm identifiers can never contain shell
    # metacharacters.
    my $remove_cmd = $self->{'full_lucene_passes_exe'} . " -remove \"$oid\"";

    return qx{$remove_cmd};
}
470# /** remove_document_from_database **/
471
472
4731;
474
475
Note: See TracBrowser for help on using the browser.