root/main/trunk/greenstone2/perllib/lucenebuilder.pm @ 28954

Revision 28954, 18.5 KB (checked in by kjdon, 6 years ago)

The library code is now going to use the indexsortfieldmap, and we need a mapping for rank/none to make that code simpler, so just add a mapping like rank->rank.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28#  *  @version 1.0 Initial implementation of incremental building
29#  *  @version 2.0 Incremental building assistance added, including
30#  *               remove_document_from_database which implements the granddad's
31#  *               empty function to call the lucene_passes.pl and full_lucene_passes_exe
32#  *               so there is one place in the code that works out where the
33#  *               perl script is. John Rowe
34#  *
35#  *  @author David Bainbridge and Katherine Don, Waikato DL Research group
36#  *  @author John Rowe, DL Consulting Ltd.
37#  *  @author John Thompson, DL Consulting Ltd.
38#  */
39###########################################################################
40
41package lucenebuilder;
42
43# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
44
45use mgppbuilder;
46use strict;
47no strict 'refs';
48use util;
49use FileUtils;
50
# Set up inheritance at compile time: lucenebuilder derives from mgppbuilder.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
54
# /**
#  *  Constructor. Builds an mgppbuilder from the supplied arguments,
#  *  reblesses it into the requested class and records:
#  *    buildtype              - always "lucene"
#  *    full_lucene_passes     - path of lucene_passes.pl
#  *    full_lucene_passes_exe - command string used to run that script
#  *
#  *  Dies when LuceneWrapper3.jar is not found under GSDLHOME/bin/java.
#  *
#  *  @author  John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Let the parent class construct the object, then rebless it.
    my $self = bless mgppbuilder->new(@_), $class;
    $self->{'buildtype'} = "lucene";

    # If ENABLE_LUCENE was turned off during GS compilation the wrapper jar
    # will be absent, and there is no point in carrying on.
    my $wrapper_jar = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin","java","LuceneWrapper3.jar");
    unless (-f $wrapper_jar) {
        die "***** ERROR: $wrapper_jar does not exist\n";
    }

    # Executable suffix for this platform (".exe" selects the first form below)
    my $exe_suffix = &util::get_os_exe ();

    # Work out where lucene_passes.pl lives
    my $script_dir = "$ENV{'GSDLHOME'}/bin/script";
    my $passes_script = &FileUtils::filenameConcatenate($script_dir, "lucene_passes.pl");
    $self->{'full_lucene_passes'} = "$passes_script";

    # Prefix the script with a perl interpreter to ensure it executes
    $self->{'full_lucene_passes_exe'} = ($exe_suffix eq ".exe")
        ? "perl$exe_suffix \"$passes_script\""
        : "\"" . &util::get_perl_exec() . "\" -S \"$passes_script\"";

    return $self;
}
# /** new() **/
93
# Delegates to the build processor's method of the same name, passing
# $index through unchanged.
sub set_sections_sort_on_document_metadata {
    my ($self, $index) = @_;

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_sections_sort_on_document_metadata($index);
}
100
# Lucene can do incremental building, so this always answers true (1).
sub is_incremental_capable {
    return 1;
}
107
# Seeds the build processor with the index fields recorded in an existing
# build.cfg, so that an incremental build knows what has already been
# indexed.
#
# Reads (via $self->read_build_cfg()):
#   indexfields   - list of field names; each is flagged in
#                   buildproc->{'extraindexfields'}
#   indexfieldmap - list of "name->shortname" strings; each populates
#                   buildproc->{'fieldnamemap'} (name => shortname and
#                   shortname => 1) and buildproc->{'allindexfields'}
#
# Returns early (doing nothing) when no build.cfg could be read. Entries of
# indexfieldmap that are not of the form "name->shortname" are skipped
# rather than being recorded under an undefined key.
sub init_for_incremental_build {
    my $self = shift (@_);

    # we want to read in indexfieldmap and indexfields from existing build.cfg
    # so that we know what has already been indexed
    my $buildcfg = $self->read_build_cfg();
    return unless defined $buildcfg;

    if (defined $buildcfg->{'indexfields'}) {
        foreach my $field (@{$buildcfg->{'indexfields'}}) {
            # extraindexfields is only supposed to hold extra fields, not
            # those already specified in indexes, and this list has all
            # indexes in it. A check is made before anything from
            # extraindexfields is included, so this is harmless.
            $self->{'buildproc'}->{'extraindexfields'}->{$field} = 1;
        }
    }

    if (defined $buildcfg->{'indexfieldmap'}) {
        foreach my $mapping (@{$buildcfg->{'indexfieldmap'}}) {
            my ($f, $v) = $mapping =~ /^(.*)\-\>(.*)$/;

            # A failed match leaves both captures undefined; skip the entry
            # instead of polluting the maps with an empty-string key.
            next unless (defined $f && defined $v);

            $self->{'buildproc'}->{'fieldnamemap'}->{$f} = $v;
            $self->{'buildproc'}->{'fieldnamemap'}->{$v} = 1;
            $self->{'buildproc'}->{'allindexfields'}->{$f} = 1;
        }
    }
}
133
# Lucene has none of the casefold/stem/accentfold/stemindexes options, so
# once the parent class has done its work they are all reset to 0.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    my @unsupported_options = qw(casefold stem accentfold stemindexes);
    @{$self}{@unsupported_options} = (0) x scalar(@unsupported_options);

    return 0;
}
145
# Returns the string "lucenebuildproc" (presumably the name of the build
# processor class that accompanies this builder -- confirm against caller).
sub default_buildproc {
    return "lucenebuildproc";
}
151
# Writes a nice version of the text docs by running every document through
# the build processor, whose output is piped into lucene_passes.pl in
# 'text' mode (or sent to STDOUT when $self->{'debug'} is set).
#
#   $textindex - index specification handed to the build processor's
#                set_index().
#
# Returns immediately when $self->{'no_text'} is set. Dies when
# lucene_passes.pl does not exist or the pipe to it cannot be opened.
# A failure reported when the pipe is closed is logged to STDERR.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    # the text directory
    my $text_dir = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text");
    my $build_dir = &FileUtils::filenameConcatenate($self->{'build_dir'},"");
    &FileUtils::makeAllDirectories ($text_dir);

    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        # NOTE(review): $text_dir is not read again after this conversion;
        # it is $build_dir that goes onto the command line -- confirm which
        # one was meant to be converted.
        $text_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    # The complete command line, built once so that the logged command and
    # the executed command cannot drift apart.
    my $cmd = "$full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    # Closing a pipe waits for the child process; a false return means the
    # child failed, which would otherwise pass unnoticed.
    if (!$self->{'debug'} && !close ($handle)) {
        print STDERR "lucenebuilder::compress_text - warning: $full_lucene_passes_exe did not exit cleanly (status " . $? . ")\n";
    }
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
234
# Builds every wanted index at every wanted level.
#
#   $indexname  - optional; when it contains a word character only this
#                 index is built, otherwise all of collect_cfg->{'indexes'}.
#   $indexlevel - optional; when it names one of the configured
#                 (non-paragraph) levels only that level is built,
#                 otherwise all configured non-paragraph levels are built.
#
# For each index/level pair the index_mapping entry is temporarily prefixed
# with the first letter of the level while build_index() runs, and restored
# afterwards.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname, $indexlevel) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->pre_build_indexes($indexname);

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # Determine what levels of index we want to build (a user may request a
    # specific level to index by using the indexlevel parameter) [jmt12]
    my @supported_levels;
    foreach my $level (keys %{$self->{'levels'}}) {
        # ignore paragraph levels as they are unsupported in Lucene
        if ($level =~ /paragraph/) {
            print $outhandle "WARNING: Paragraph level indexing not supported by Lucene. Ignoring index\n";
            next;
        }
        push (@supported_levels, $level);
    }

    # Build only the requested level if one was specified and is available.
    # This is decided after the whole level list has been gathered, so the
    # outcome does not depend on the (arbitrary) hash key order.
    my @desired_indexlevels = @supported_levels;
    if (defined $indexlevel) {
        my @requested = grep { $_ eq $indexlevel } @supported_levels;
        @desired_indexlevels = @requested if (scalar(@requested));
    }

    # Create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {

        if ($self->want_built($index)) {

            my $idx = $self->{'index_mapping'}->{$index};
            # we now iterate through the filtered list of index levels [jmt12]
            foreach my $level (@desired_indexlevels) {
                next if $level =~ /paragraph/; # we don't do para indexing
                my ($pindex) = $level =~ /^(.)/;
                # should probably check that new name with level
                # is unique ... but currently (with doc sec and para)
                # each has unique first letter.
                $self->{'index_mapping'}->{$index} = $pindex.$idx;

                my $llevel = $mgppbuilder::level_map{$level};
                print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                    "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
                print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

                $self->build_index($index,$llevel);
            }
            $self->{'index_mapping'}->{$index} = $idx;

        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }

    $self->post_build_indexes();
}
307
308
# Builds one index at one level by running every document through the build
# processor, whose output is piped into lucene_passes.pl in 'index' mode
# (or sent to STDOUT when $self->{'debug'} is set).
#
#   $index  - index specification of the form
#             "fields[:subcollections[:languages]]", where subcollections
#             and languages are comma separated lists.
#   $llevel - level tag passed to lucene_passes.pl; it is mapped back to a
#             key of $self->{'levels'} through %mgppbuilder::level_map.
#
# Unless $self->{'incremental'} is set, "-removeold" is passed to
# lucene_passes.pl. Dies when lucene_passes.pl does not exist or the pipe to
# it cannot be opened. A failure reported when the pipe is closed is logged
# to STDERR.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'incremental'}) ? "" : "-removeold";

    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    # (the leading fields part is not needed here)
    my (undef, $subcollection, $language) = split (":", $index);

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each entry is passed on exactly as written, including any leading "!"
    # negation marker.
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n"  if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The complete command line, built once so that the logged command and
    # the executed command cannot drift apart.
    my $cmd = "$full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\"   $osextra";

    my ($handle);

    if ($self->{'debug'}) {
        $handle = *STDOUT;
    } else {
        print STDERR "Cmd: $cmd\n";
        if (!-e "$full_lucene_passes" || !open($handle, "| $cmd")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # Map the level tag back to the matching key of the configured levels
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($local_levels);
    if (defined $self->{'collect_cfg'}->{'sortfields'}) {
        $self->{'buildproc'}->set_sortfields ($self->{'collect_cfg'}->{'sortfields'});
    }

    $self->{'buildproc'}->set_db_level($db_level);
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    # Closing a pipe waits for the child process; a false return means the
    # child failed, which would otherwise pass unnoticed.
    if (!$self->{'debug'} && !close ($handle)) {
        print STDERR "lucenebuilder::build_index - warning: $full_lucene_passes_exe did not exit cleanly (status " . $? . ")\n";
    }

    $self->print_stats();

    # put the full set of levels back now that this level is done
    $self->{'buildproc'}->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
434
# /** A modified version of basebuilder.pm's function that generates the
#  *  information database from the GA documents. When an incremental build
#  *  has been requested, metadata is only added to autohierarchy classifiers
#  *  via the IncrementalBuildUtils module; all other classifiers and
#  *  metadata are ignored.
#  */
# This was added to utilize DLC's incremental updating of Hierarchy
# classifiers. They are heading towards just using dynamic classifiers, and we
# do not want to use this code either. So now, we just use basebuilder's
# version of make_infodatabase.
sub make_infodatabase_dlc
{
    my $self = shift;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # Locate the info database file
    my $text_directory_path = &FileUtils::filenameConcatenate($build_dir, "text");
    my $infodb_file_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_directory_path);

    # An incremental addition is only possible when one was asked for AND an
    # information database already exists. In every other case hand over to
    # the basebuilder implementation (called as a method, so that $self is
    # carried along) and stop here.
    unless ($self->{'incremental'} && -e $infodb_file_path) {
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** performing an incremental addition to the info database\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Stage name='CreateInfoData'>\n";
    }

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: this is still needed to process any associated files - but
    #    nothing is expected to be piped to the database, so the complex
    #    output handle can be done away with.
    my $assocdir = &FileUtils::filenameConcatenate($build_dir, "assoc");
    &FileUtils::makeAllDirectories ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug and using ourselves as the document processor!
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
485
# /** Lucene specific document removal function. Runs the stored
#  *  lucene_passes.pl command with "-remove" followed by the (double
#  *  quoted) document id, and hands back whatever the command printed.
#  *
#  *  NOTE(review): the id is interpolated into a shell command line, so it
#  *  is assumed not to contain shell metacharacters -- confirm with callers.
#  *
#  *  @param oid is the document identifier to be removed.
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift;
    my ($oid) = @_;

    # The perl script invocation prepared by the constructor, with the
    # removal request tacked on the end
    my $remove_cmd = $self->{'full_lucene_passes_exe'} . qq{ -remove "$oid"};

    return qx{$remove_cmd};
}
# /** remove_document_from_database **/
502
# Adds the Lucene sort-field information to the build configuration, after
# the mgppbuilder implementation has added its own entries.
#
#   $build_cfg - hash reference that receives:
#       indexsortfields   - list of sort field short names
#       indexsortfieldmap - list of "name->shortname" strings
#
# The pseudo fields "rank" and "none" map to themselves. Any other sort
# field is only included when it is flagged in the build processor's
# 'actualsortfields', using the short name from 'sortfieldnamemap'.
sub build_cfg_extra {
    my $self = shift (@_);
    my ($build_cfg) = @_;

    $self->mgppbuilder::build_cfg_extra($build_cfg);

    # need to add in sort stuff
    my @sortfields = ();
    my @sortfieldmap = ();

    foreach my $sf (@{$self->{'buildproc'}->{'sortfields'}}) {
        if ($sf eq "rank" || $sf eq "none") {
            # identity mapping, so consumers of the map need no special case
            push(@sortfields, $sf);
            push (@sortfieldmap, "$sf\-\>$sf");
        } elsif ($self->{'buildproc'}->{'actualsortfields'}->{$sf}) {
            my $shortname = $self->{'buildproc'}->{'sortfieldnamemap'}->{$sf};
            push(@sortfields, $shortname);
            push (@sortfieldmap, "$sf\-\>$shortname");
        }
    }

    $build_cfg->{'indexsortfields'} = \@sortfields;
    $build_cfg->{'indexsortfieldmap'} = \@sortfieldmap;
}

5271;
528
529
Note: See TracBrowser for help on using the browser.