root/main/trunk/greenstone2/perllib/lucenebuilder.pm @ 25723

Revision 25723, 16.4 KB (checked in by kjdon, 8 years ago)

When reloading the indexfieldmap from build.cfg for incremental building, we need to add in the shortname->1 mapping so that, when generating a new shortname, we can see which other shortnames have already been used. Thanks to Malita, DL Consulting.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 Initial implementation of incremental building
29#  *  @version 2.0 Incremental building assistance added, including
30#  *               remove_document_from_database which implements the granddad's
31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
32#  *               so there is one place in the code that works out where the
33#  *               perl script is. John Rowe
34#  *
35#  *  @author David Bainbridge and Katherine Don, Waikato DL Research group
36#  *  @author John Rowe, DL Consulting Ltd.
37#  *  @author John Thompson, DL Consulting Ltd.
38#  */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
53
# /**
#  *  Constructor. Builds an mgppbuilder object from the supplied arguments,
#  *  re-blesses it into the requested class and records how lucene_passes.pl
#  *  is to be invoked.
#  *
#  *  Fields set on the object:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path of lucene_passes.pl (used for existence checks)
#  *    full_lucene_passes_exe - command-line prefix used to run that script
#  *
#  *  Dies if LuceneWrapper.jar is not found under $GSDLHOME/bin/java.
#  *
#  *  @param  all arguments after the class name are passed on to mgppbuilder's constructor
#  *  @return the new lucenebuilder object
#  *  @author  John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Explicit method-call syntax. The indirect-object form
    # ("new mgppbuilder (...)") is ambiguous to the parser, all the more so
    # inside a package that defines its own new().
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation, then we won't be able to
    # continue. Check for existence of LuceneWrapper to see if Lucene was disabled.
    my $lucene = &util::filename_cat($ENV{'GSDLHOME'},"bin","java","LuceneWrapper.jar");
    if (! -f $lucene) {
        die "***** ERROR: $lucene does not exist\n";
    }

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe ();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # Keep the bare script path as well as the full command: the bare path is
    # what the build steps test with -e before trying to run anything.
    $self->{'full_lucene_passes'} = $lucene_passes_script;

    # So tack perl on the beginning to ensure execution
    if ($exe eq ".exe") {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else {
        $self->{'full_lucene_passes_exe'} = "\"".&util::get_perl_exec()."\" -S \"$lucene_passes_script\"";
    }

    return $self;
}
# /** new() **/
92
# Capability query: lucene can do incremental building, so this always
# returns 1 (true).
sub is_incremental_capable {
    return 1;
}
99
# Prime the build processor with the index field information recorded by a
# previous build, so that an incremental build knows what has already been
# indexed.
#
# Reads the existing build.cfg via $self->read_build_cfg() and, if one was
# returned:
#   * marks every entry of 'indexfields' as present in
#     $self->{'buildproc'}->{'indexfields'};
#   * for every "name->shortname" entry of 'indexfieldmap', records both
#     name => shortname and shortname => 1 in
#     $self->{'buildproc'}->{'indexfieldmap'}. The shortname => 1 entry lets
#     code that generates a new shortname see which ones are already taken.
#
# Entries of 'indexfieldmap' that do not have the "name->shortname" form are
# skipped; previously they produced an empty-string key in the map.
#
# Returns early, doing nothing, when no build.cfg could be read.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            $self->{'buildproc'}->{'indexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($name, $shortname) = $mapping =~ /^(.*)\-\>(.*)$/;

            # No "->" separator: both captures are undef. Skip the entry
            # rather than storing it under an empty key.
            next unless defined $name;

            $self->{'buildproc'}->{'indexfieldmap'}->{$name} = $shortname;
            $self->{'buildproc'}->{'indexfieldmap'}->{$shortname} = 1;
        }
    }
}
123
# Lucene has none of the casefold / stem / accentfold / stemindexes options.
# Let the parent class generate its options first, then force all four of
# them off on this object.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    $self->{'casefold'}
        = $self->{'stem'}
        = $self->{'accentfold'}
        = $self->{'stemindexes'}
        = 0;
}
135
# Name of the document processor used when building with this builder.
# Always returns the string "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    return "lucenebuildproc";
}
141
# this writes a nice version of the text docs
#
# Runs every document through the build processor in 'text' mode (with text
# indexing switched off) and pipes the output to "lucene_passes.pl text Doc".
# In debug mode the output goes to STDOUT instead of the pipe.
#
# Does nothing when $self->{'no_text'} is set.
#
# @param textindex  passed on unchanged to the build processor's set_index()
#
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # Build the command once so that the logged command and the executed
    # command cannot drift apart.
    my $command = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $command\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $command"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Name this sub in the message (it used to say build_index).
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child and fails if it exited with a
    # non-zero status; report that instead of silently ignoring it.
    if (!$self->{'debug'} && !close ($handle))
    {
        print STDERR "Warning: closing the pipe to lucene_passes failed (status $?, error: $!)\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
225
# Build the requested index, or every index listed in the collection
# configuration, once per configured level (paragraph level excluded).
#
# @param indexname  optional; when it is defined and contains a word
#                   character only that index is built, otherwise
#                   $self->{'collect_cfg'}->{'indexes'} is used.
#
# While a given level is being built, $self->{'index_mapping'}->{$index}
# temporarily holds the level-prefixed directory name; the unprefixed name is
# put back once all levels of that index are done.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # Have we got para index? Warn once if so.
    if (grep { $_ =~ /paragraph/ } keys %{$self->{'levels'}}) {
        print $outhandle "Warning: Paragraph level indexing not supported by Lucene/Solr\n";
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $plain_dir = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            # we don't do para indexing
            next if $level =~ /paragraph/;

            # Prefix the directory name with the first letter of the level.
            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($level_initial) = $level =~ /^(.)/;
            my $level_dir = $level_initial.$plain_dir;
            $self->{'index_mapping'}->{$index} = $level_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            if ($self->{'verbosity'} >= 1) {
                print $outhandle "\n*** building index $index at level $llevel in subdirectory $level_dir\n";
            }
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # restore the unprefixed directory name
        $self->{'index_mapping'}->{$index} = $plain_dir;
    }

    $self->post_build_indexes();
}
282
283
# Build one index at one level by piping the documents, via the build
# processor, into "lucene_passes.pl index".
#
# @param index   index description of the form "fields[:subcollections[:languages]]",
#                where subcollections and languages are comma separated lists
# @param llevel  the level name handed to lucene_passes; it is matched against
#                the values of %mgppbuilder::level_map to find the level key
#                the build processor should work on
#
# The index directory is $self->{'index_mapping'}->{$index} below the build
# directory. Unless $self->{'incremental'} is set, lucene_passes is asked to
# remove any old index first (-removeold). In debug mode output goes to STDOUT
# instead of the pipe.
#
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } else {
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my $langarr = [];

    # there may be subcollection info, and language info.
    # (the leading fields part is not needed here)
    my (undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # Each language is passed through as written, including any leading "!"
    # negation marker. (The earlier code stripped the "!" and then put it
    # straight back, which came to the same thing.)
    push (@$langarr, @languages);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    # Build the command once so that the logged command and the executed
    # command cannot drift apart.
    my $command = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $command\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $command")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always

    # Translate the lucene level name back into the level key used in
    # $self->{'levels'}.
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child and fails if it exited with a
    # non-zero status; report that instead of silently ignoring it.
    if (!$self->{'debug'} && !close ($handle)) {
        print STDERR "Warning: closing the pipe to lucene_passes failed (status $?, error: $!)\n";
    }

    $self->print_stats();

    # put the full set of levels back on the build processor
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
405
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents. When an incremental build
#  *  has been requested, the intent is that only metadata for autohierarchy
#  *  classifiers is added, via the IncrementalBuildUtils module; all other
#  *  classifiers and metadata are ignored.
#  *
#  *  Any arguments are passed through to basebuilder::make_infodatabase when
#  *  the build is not incremental or no information database exists yet.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and
# we do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my $self = shift (@_);

    # Get info database file path
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_dir);

    my $outhandle = $self->{'outhandle'};

    # Incremental addition can only occur if an information database already
    # exists. In every other case hand over to basebuilder's version, which
    # generates it.
    unless ($self->{'incremental'} && -e $infodb_path)
    {
        # Called as a method: a direct basebuilder::make_infodatabase(@_)
        # call would lose all the $self data.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away with
    #    the complex output handle.
    my $assoc_dir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assoc_dir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assoc_dir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug, with the build processor as the document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {},
                   $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
456
# /** Lucene specific document removal function. This works by calling lucene_passes.pl with
#  *  -remove and the document id on the command line.
#  *
#  *  A missing or empty document id is ignored (nothing is run). If the
#  *  command reports a non-zero status, a warning is printed to STDERR.
#  *
#  *  @param oid is the document identifier to be removed.
#  *  @return whatever the command wrote to standard output: a list of lines
#  *          in list context, a single string in scalar context.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my ($self, $oid) = @_;

    # Nothing sensible can be removed without an identifier.
    return unless defined $oid && $oid ne "";

    # Find the perl script to call to run lucene
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is interpolated into a shell command line. This is
    # only safe while document ids contain no shell metacharacters or double
    # quotes - confirm that callers guarantee this.

    # Call lucene_passes.pl with -remove and the document ID on the command line
    my @output = `$full_lucene_passes_exe -remove "$oid"`;

    # $? holds the wait status of the command that was just run.
    if ($? != 0)
    {
        print STDERR "Warning: lucene_passes -remove failed for document \"$oid\" (status $?)\n";
    }

    return wantarray ? @output : join("", @output);
}
# /** remove_document_from_database **/
473
474
4751;
476
477
Note: See TracBrowser for help on using the browser.