root/main/trunk/greenstone2/perllib/lucenebuilder.pm @ 27357

Revision 27357, 18.2 KB (checked in by kjdon, 6 years ago)

setting sortfield info to buildproc, and getting it back from buildproc for the build.cfg file.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 Initial implementation of incremental building
29#  *  @version 2.0 Incremental building assistance added, including
30#  *               remove_document_from_database which implements the granddad's
31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
32#  *               so there is one place in the code that works out where the
33#  *               perl script is. John Rowe
34#  *
35#  *  @author David Bainbridge and Katherine Don, Waikato DL Research group
36#  *  @author John Rowe, DL Consulting Ltd.
37#  *  @author John Thompson, DL Consulting Ltd.
38#  */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49use FileUtils;
50
51sub BEGIN {
52    @lucenebuilder::ISA = ('mgppbuilder');
53}
54
# /**
#  *  Constructor. Builds an mgppbuilder object from the supplied arguments,
#  *  re-blesses it into the requested class and records where the
#  *  lucene_passes.pl script lives and how to invoke it.
#  *
#  *  Sets on the object:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path to lucene_passes.pl
#  *    full_lucene_passes_exe - command prefix used to run that script
#  *
#  *  Dies if LuceneWrapper3.jar is not found under $GSDLHOME/bin/java.
#  *
#  *  @author  John Thompson, DL Consulting Ltd.
#  */
sub new {
    my ($class, @args) = @_;

    # construct the parent object, then turn it into one of ours
    my $self = bless mgppbuilder->new(@args), $class;
    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation the wrapper jar
    # will be missing, and there is no point in carrying on.
    my $wrapper_jar = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "bin", "java", "LuceneWrapper3.jar");
    die "***** ERROR: $wrapper_jar does not exist\n" unless -f $wrapper_jar;

    # Executable suffix for this OS (".exe" selects the first branch below)
    my $exe_suffix = &util::get_os_exe();

    # Location of lucene_passes.pl
    my $passes_script = &FileUtils::filenameConcatenate("$ENV{'GSDLHOME'}/bin/script", "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Prefix the script with a perl interpreter to ensure execution
    $self->{'full_lucene_passes_exe'} = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix \"$passes_script\""
        : "\"" . &util::get_perl_exec() . "\" -S \"$passes_script\"";

    return $self;
}
# /** new() **/
93
# Reports whether this builder can do incremental building.
# Lucene can, so the answer is always 1.
sub is_incremental_capable {
    my $lucene_can_build_incrementally = 1;
    return $lucene_can_build_incrementally;
}
100
# Seeds the buildproc with the index field information recorded in the
# existing build.cfg, so that an incremental build knows what has already
# been indexed.
#
# Reads (via $self->read_build_cfg()):
#   indexfields   - list of field names
#   indexfieldmap - list of "fullname->shortname" strings
# Writes into $self->{'buildproc'}:
#   extraindexfields, fieldnamemap, allindexfields
#
# Does nothing if no build.cfg data is available. Entries of indexfieldmap
# that are undefined or not of the form "fullname->shortname" are skipped
# rather than being recorded under an empty key.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            next unless defined $field;
            # extraindexfields is only supposed to hold fields that are not
            # already specified in indexes, whereas this list holds all of
            # them. A check against indexes is made before anything from
            # extraindexfields is used, so recording them all here is ok.
            $self->{'buildproc'}->{'extraindexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            next unless defined $mapping;
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;
            # a failed match leaves both undefined - ignore such entries
            next unless defined $f && defined $v;
            $self->{'buildproc'}->{'fieldnamemap'}->{$f} = $v;   # full name -> short name
            $self->{'buildproc'}->{'fieldnamemap'}->{$v} = 1;    # mark short name as taken
            $self->{'buildproc'}->{'allindexfields'}->{$f} = 1;
        }
    }
}
126
# Lucene has none of the casefold/stem/accentfold options: once the
# superclass version has run, every one of them is forced to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    # switch off everything that lucene does not offer
    @{$self}{'casefold', 'stem', 'accentfold'} = (0, 0, 0);
    $self->{'stemindexes'} = 0;
}
138
# Name of the buildproc used when none is configured explicitly.
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
144
# this writes a nice version of the text docs
#
# Runs the "text" pass: opens a pipe to lucene_passes.pl (or uses STDOUT when
# $self->{'debug'} is set), configures the buildproc for text mode at
# document and section level, and pushes every document from the source
# directory through the plugins into that handle.
#
# Parameters:
#   $textindex - index specification handed to the buildproc via set_index
#
# Returns immediately if $self->{'no_text'} is set.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
# If the lucene_passes process reports failure when the pipe is closed, a
# warning is written to the output handle.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories ($text_dir);

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        # the command line is built once so the logged and executed forms agree
        my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra";

        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # closing a pipe waits for the child; a false result means it failed
    if (!$self->{'debug'} && !close ($handle)) {
        print $outhandle "WARNING: lucene_passes text pass did not finish cleanly (exit status " . ($? >> 8) . ")\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
227
# Builds every wanted index at every wanted level.
#
# Parameters:
#   $indexname  - if it contains a word character, only this index is built;
#                 otherwise all indexes listed in collect_cfg are considered
#   $indexlevel - if it names one of the levels in $self->{'levels'}, only
#                 that level is built; otherwise (undefined, empty, unknown
#                 or a paragraph level) all supported levels are built
#
# Paragraph levels are not supported by Lucene and are always dropped with a
# warning. Levels are processed in sorted order so that builds are
# repeatable. For each index/level pair the index_mapping entry is
# temporarily prefixed with the first letter of the level name while
# build_index runs, and restored afterwards.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Work out which levels Lucene can index at all
    my @supported_levels;
    foreach my $level (sort keys %{$self->{'levels'}}) {
        # ignore paragraph levels as they are unsupported in Lucene
        if ($level =~ /paragraph/) {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
            next;
        }
        push (@supported_levels, $level);
    }

    # Determine what levels of index we want to build (a user may select a
    # specific level to index by using the indexlevel parameter) [jmt12]
    my @desired_indexlevels = @supported_levels;
    if (defined $indexlevel) {
        my @requested = grep { $_ eq $indexlevel } @supported_levels;
        # build only the requested level if it is one we know about;
        # otherwise fall back to building all levels defined
        @desired_indexlevels = @requested if @requested;
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            # we now iterate through the filtered list of index levels [jmt12]
            foreach my $level (@desired_indexlevels) {
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            # put the level-independent directory name back
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();
}
300
301
# Builds one index at one level by piping documents into lucene_passes.pl.
#
# Parameters:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             with subcollections and languages comma separated
#   $llevel - level tag as found in the values of %mgppbuilder::level_map
#
# The index is written beneath build_dir in the subdirectory given by
# $self->{'index_mapping'}->{$index}. Unless $self->{'incremental'} is set,
# lucene_passes.pl is run with -removeold. With $self->{'debug'} set the
# output goes to STDOUT instead of a pipe.
#
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
# If the lucene_passes process reports failure when the pipe is closed, a
# warning is written to the output handle.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index. Each entry is passed on exactly as written,
    # including any leading "!" marking a negated language.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        # the command line is built once so the logged and executed forms agree
        my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # map the level tag back to the level name used as key in 'levels'
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});

    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # closing a pipe waits for the child; a false result means it failed
    if (!$self->{'debug'} && !close ($handle)) {
        print $outhandle "WARNING: lucene_passes did not finish cleanly while building index $index (exit status " . ($? >> 8) . ")\n";
    }

    $self->print_stats();

    # restore the full set of levels for whoever uses the buildproc next
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
425
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents. When asked to do an
#  *  incremental build against an info database that already exists, the
#  *  buildproc is run in 'incinfodb' mode instead of regenerating the whole
#  *  database; in every other case basebuilder's version is used.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and
# we do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my ($self, @args) = @_;
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives
    my $text_directory_path = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # Incremental addition is only possible when it has been asked for AND an
    # information database already exists. Otherwise hand over to the
    # basebuilder implementation, which generates the database from scratch.
    # Note: this has to be a method call on $self - calling
    #       basebuilder::make_infodatabase(...) directly would lose all the
    #       $self data.
    unless ($self->{'incremental'} && -e $infodb_file_path)
    {
        $self->basebuilder::make_infodatabase(@args);
        return;
    }

    # From here on we are doing an incremental addition
    if ($self->{'verbosity'} >= 1)
    {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'})
    {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Initialise all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Configure the buildproc. It is still needed to process any
    #    associated files, but nothing is expected to be piped to the
    #    database, so no output handle is set up.
    my $assoc_directory_path = &FileUtils::filenameConcatenate($self->{'build_dir'}, "assoc");
    &FileUtils::makeAllDirectories ($assoc_directory_path);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assoc_directory_path);

    # 3. Read all the metadata from the files in the source directory, with
    #    the buildproc acting as the document processor
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
476
# /** Lucene specific document removal function. Runs the lucene_passes.pl
#  *  command stored in $self->{'full_lucene_passes_exe'} with -remove and
#  *  the document id on the command line.
#  *
#  *  @param oid is the document identifier to be removed.
#  *
#  *  NOTE(review): the oid is interpolated into a shell command line
#  *  unescaped - confirm that identifiers can never contain shell
#  *  metacharacters or double quotes.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift (@_);
    my ($oid) = @_;

    # The command that runs lucene_passes.pl was worked out in new()
    my $lucene_passes_cmd = $self->{'full_lucene_passes_exe'};

    # Run it with -remove and the document ID
    return `$lucene_passes_cmd -remove "$oid"`;
}
# /** remove_document_from_database **/
493
# Adds the lucene specific entries to the build.cfg data.
#
# Parameters:
#   $build_cfg - hash reference holding the build configuration; modified
#                in place
#
# After mgppbuilder's version has run, two entries are set:
#   indexsortfields   - "rank", plus the short name of every sort field that
#                       the buildproc flagged in 'actualsortfields'
#   indexsortfieldmap - "fullname->shortname" for each of those fields
# Fields are taken in the order of the buildproc's 'sortfields' list.
sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->mgppbuilder::build_cfg_extra($build_cfg);

    # need to add in sort stuff
    my @sortfields = ();
    my @sortfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'sortfields'}}) {
        if ($sf eq "rank") {
            # rank is built in and has no short name
            push(@sortfields, $sf);
        } elsif ($self->{'buildproc'}->{'actualsortfields'}->{$sf}) {
            my $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sf};
            push(@sortfields, $shortname);
            push (@sortfieldmap, "$sf\-\>$shortname");
        }
    }
    $build_cfg->{'indexsortfields'} = \@sortfields;
    $build_cfg->{'indexsortfieldmap'} = \@sortfieldmap;
}

1;
518
519
Note: See TracBrowser for help on using the browser.