source: trunk/gsdl/perllib/lucenebuilder.pm@ 8716

Last change on this file since 8716 was 8716, checked in by kjdon, 19 years ago

added some changes made by Emanuel Dejanu (Simple Words)

  • Property svn:keywords set to Author Date Id Revision
File size: 7.8 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuilder;
27
28# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
29
30use mgppbuilder;
31
32sub BEGIN {
33 @lucenebuilder::ISA = ('mgppbuilder');
34}
35
36
37sub new {
38 my $class = shift(@_);
39 my ($collection, $source_dir, $build_dir, $verbosity,
40 $maxdocs, $debug, $keepold, $allclassifications,
41 $outhandle, $no_text, $gli) = @_;
42
43 my $self = new mgppbuilder (@_);
44 $self = bless $self, $class;
45
46 # load up the document processor for building
47 # if a buildproc class has been created for this collection, use it
48 # otherwise, use the lucene buildproc
49 my ($buildprocdir, $buildproctype);
50 if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
51 $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
52 $buildproctype = "${collection}buildproc";
53 } else {
54 $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
55 $buildproctype = "lucenebuildproc";
56 }
57 require "$buildprocdir/$buildproctype.pm";
58
59 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
60 "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
61 die "$@" if $@;
62
63 $self->{'buildtype'} = "lucene";
64
65 return $self;
66}
67
68sub compress_text {
69
70 my $self = shift (@_);
71 my ($textindex) = @_;
72}
73
74sub build_indexes {
75 my $self = shift (@_);
76 my ($indexname) = @_;
77 my $outhandle = $self->{'outhandle'};
78
79 my $indexes = [];
80 if (defined $indexname && $indexname =~ /\w/) {
81 push @$indexes, $indexname;
82 } else {
83 $indexes = $self->{'collect_cfg'}->{'indexes'};
84 }
85
86 # create the mapping between the index descriptions
87 # and their directory names (includes subcolls and langs)
88 $self->{'index_mapping'} = $self->create_index_mapping ($indexes);
89
90 # build each of the indexes
91 foreach $index (@$indexes) {
92 if ($self->want_built($index)) {
93
94 my $idx = $self->{'index_mapping'}->{$index};
95 foreach my $level (keys %{$self->{'levels'}}) {
96 my ($pindex) = $level =~ /^(.)/;
97 # should probably check that new name with level
98 # is unique ... but currently (with doc sec and para)
99 # each has unique first letter.
100 $self->{'index_mapping'}->{$index} = $pindex.$idx;
101
102 my $llevel = $mgppbuilder::level_map{$level};
103 print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
104 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
105 print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};
106
107 $self->build_index($index,$llevel);
108 }
109 $self->{'index_mapping'}->{$index} = $idx;
110
111 } else {
112 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
113 }
114 }
115
116 #define the final field lists
117 $self->make_final_field_list();
118}
119
120
121
122
123
124sub build_index {
125 my $self = shift (@_);
126 my ($index,$llevel) = @_;
127 my $outhandle = $self->{'outhandle'};
128 my $build_dir = $self->{'build_dir'};
129
130 # get the full index directory path and make sure it exists
131 my $indexdir = $self->{'index_mapping'}->{$index};
132 &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));
133
134 # get any os specific stuff
135 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
136 my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
137
138 my $exe = &util::get_os_exe ();
139 my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
140
141 # define the section names for lucenepasses
142 # define the section names and possibly the doc name for lucenepasses
143 my $lucene_passes_sections = $llevel;
144
145 my $osextra = "";
146 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
147 $build_dir =~ s@/@\\@g;
148 } else {
149 $osextra = " -d /";
150 if ($outhandle ne "STDERR") {
151 # so lucene_passes doesn't print to stderr if we redirect output
152 $osextra .= " 2>/dev/null";
153 }
154 }
155
156 # get the index expression if this index belongs
157 # to a subcollection
158 my $indexexparr = [];
159
160 # there may be subcollection info, and language info.
161 my ($fields, $subcollection, $language) = split (":", $index);
162 my @subcollections = ();
163 @subcollections = split /,/, $subcollection if (defined $subcollection);
164
165 foreach $subcollection (@subcollections) {
166 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
167 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
168 }
169 }
170
171 # add expressions for languages if this index belongs to
172 # a language subcollection - only put languages expressions for the
173 # ones we want in the index
174
175 # this puts a separate Language/en entry in for each language in the list
176 # is this what we want?
177 # should we just have one entry with Language/en,es/ ??
178 my @languages = ();
179 @languages = split /,/, $language if (defined $language);
180 foreach $language (@languages) {
181 my $not=0;
182 if ($language =~ s/^\!//) {
183 $not = 1;
184 }
185 if ($not) {
186 push (@$indexexparr, "!Language/$language/");
187 } else {
188 push (@$indexexparr, "Language/$language/");
189 }
190 }
191
192 # Build index dictionary. Uses verbatim stem method
193 print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
194 print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
195 my ($handle);
196
197 if ($self->{'debug'}) {
198 $handle = STDOUT;
199 } else {
200 if (!-e "$lucene_passes_exe" ||
201 !open (PIPEOUT, "| $lucene_passes_exe $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {
202 print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
203 die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
204 }
205 $handle = lucenebuilder::PIPEOUT;
206 }
207
208 my $store_levels = $self->{'levels'};
209
210 my $dom_level = "";
211 foreach my $key (keys %$store_levels) {
212 if ($mgppbuilder::level_map{$key} eq $llevel) {
213 $dom_level = $key;
214 }
215 }
216 if ($dom_level eq "") {
217 print STDERR "Warning: unrecognized tag level $llevel\n";
218 $dom_level = "document";
219 }
220
221 my $local_levels = { $dom_level => 1 }; # work on one level at a time
222
223 # set up the document processr
224 $self->{'buildproc'}->set_output_handle ($handle);
225 $self->{'buildproc'}->set_mode ('text');
226 $self->{'buildproc'}->set_index ($index, $indexexparr);
227 $self->{'buildproc'}->set_indexing_text (1);
228 $self->{'buildproc'}->set_store_text(1);
229 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
230 $self->{'buildproc'}->set_levels ($local_levels);
231 $self->{'buildproc'}->reset();
232 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
233 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
234 close ($handle) unless $self->{'debug'};
235
236 $self->print_stats();
237
238 $self->{'buildproc'}->set_levels ($store_levels);
239 print STDERR "</Stage>\n" if $self->{'gli'};
240}
241
2421;
243
244
Note: See TracBrowser for help on using the repository browser.