source: trunk/gsdl/perllib/lucenebuilder.pm@ 9548

Last change on this file since 9548 was 9548, checked in by kjdon, 19 years ago

For the language subcollection stuff, can now specify language_metadata in the config file - will use that metadata instead of ex.Language to match on for language

  • Property svn:keywords set to Author Date Id Revision
File size: 10.8 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuilder;
27
28# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
29
30use mgppbuilder;
31
# Set up inheritance at compile time: lucenebuilder derives from
# mgppbuilder, so any method not defined here is looked up there.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
35
36
# Constructor.
#
# Arguments (positional, all forwarded unchanged to mgppbuilder's
# constructor):
#   $collection, $source_dir, $build_dir, $verbosity,
#   $maxdocs, $debug, $keepold, $allclassifications,
#   $outhandle, $no_text, $gli
#
# Returns the mgppbuilder object re-blessed into $class, with
#   {'buildproc'} - the document processor used while building
#   {'buildtype'} - set to "lucene"
#
# Dies if the buildproc module cannot be loaded or constructed.
sub new {
    my $class = shift(@_);

    # Only some of these are used below; the full list is kept to
    # document the expected argument order.
    my ($collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $gli) = @_;

    # Explicit method-call syntax: the indirect-object form
    # ("new mgppbuilder (...)") is ambiguous inside a package that
    # defines its own new().
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the lucene buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "lucenebuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor directly on the class name instead of going
    # through a string eval: no code is generated from the collection
    # name, and any exception simply propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                               $build_dir, $verbosity,
                                               $outhandle);

    $self->{'buildtype'} = "lucene";

    return $self;
}
67
# Writes out the text of the documents: every document is run through
# the build processor in 'text' mode and the result is piped to
# "lucene_passes.pl text" at the "Doc" level.
#
#   $textindex - index specification handed to the buildproc's set_index
#
# When $self->{'debug'} is set the stream goes to STDOUT instead of the
# lucene_passes.pl pipe.  Dies if lucene_passes.pl does not exist or the
# pipe to it cannot be opened.
#
# Side effect: the 'paragraph' entry of $self->{'levels'} is set to undef.
sub compress_text {

    my $self = shift (@_);
    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";

    # make sure the text directory exists below the build directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i && $outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    my $full_lucene_passes_exe = $lucene_passes_exe;
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe";
    }
    my $lucene_passes_sections = "Doc";

    # File handles are handed to the buildproc by (package-qualified)
    # name, so PIPEOUT has to stay a named handle.
    my ($handle);

    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        if (!-e "$lucene_passes_exe" ||
            !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($levels->{'section'}) {
        $gdbm_level = "section";
    }

    undef $levels->{'paragraph'}; # get rid of para if we had it.

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_gdbm_level ($gdbm_level);
    $buildproc->reset();

    # run all the documents through the plugins
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # $handle names PIPEOUT when not debugging, so one close is enough;
    # STDOUT is deliberately left open in debug mode.
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};

}
142
# Builds the requested indexes, one pass per level.
#
#   $indexname - optional.  If it contains a word character only that
#                index is built; otherwise every index listed under
#                'indexes' in the collection configuration is considered.
#
# Indexes for which want_built() returns false are reported and skipped.
# For each remaining index, build_index() is called once per entry of
# $self->{'levels'} (paragraph levels are skipped), with the index's
# directory name temporarily prefixed by the first letter of the level.
# Finishes by calling make_final_field_list().
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    # (lexical loop variable: the original iterated over an undeclared
    # package global)
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        # remember the plain directory name so it can be restored
        # once all levels of this index have been built
        my $idx = $self->{'index_mapping'}->{$index};

        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing
            my ($pindex) = $level =~ /^(.)/;
            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            $self->{'index_mapping'}->{$index} = $pindex.$idx;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        $self->{'index_mapping'}->{$index} = $idx;
    }

    #define the final field lists
    $self->make_final_field_list();
}
189
190
191
192
193
# Builds a single index at a single level by piping the documents
# through "lucene_passes.pl index".
#
#   $index  - index description of the form "fields[:subcolls[:langs]]",
#             where subcolls and langs are comma separated lists
#   $llevel - the level name passed to lucene_passes.pl; it is matched
#             against the values of %mgppbuilder::level_map
#
# The index is written to the subdirectory of the build directory given
# by $self->{'index_mapping'}->{$index}.  When $self->{'debug'} is set
# the stream goes to STDOUT instead of the lucene_passes.pl pipe.
# Dies if lucene_passes.pl does not exist or the pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # get any os specific stuff
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    my $full_lucene_passes_exe = $lucene_passes_exe;
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $full_lucene_passes_exe = "perl.exe -S $lucene_passes_exe";
    }

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s{/}{\\}g;
    } else {
        # NOTE(review): the meaning of "-d /" is defined by
        # lucene_passes.pl, which is not visible from here.
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    my @subcollections = defined $subcollection ? split (/,/, $subcollection) : ();
    foreach my $subcoll (@subcollections) {
        # only subcollections defined in the collection config contribute
        my $subcoll_expr = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $subcoll_expr) if defined $subcoll_expr;
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index

    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }

    my @languages = defined $language ? split (/,/, $language) : ();
    foreach my $lang (@languages) {
        # a leading '!' on the language is moved to the front of the
        # whole expression (presumably negating the match - the
        # expression itself is interpreted by the buildproc)
        my $prefix = ($lang =~ s/^\!//) ? "!" : "";
        push (@$indexexparr, "$prefix$language_metadata/$lang/");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # File handles are handed to the buildproc by (package-qualified)
    # name, so PIPEOUT has to stay a named handle.
    my ($handle);

    if ($self->{'debug'}) {
        $handle = "STDOUT";
    } else {
        if (!-e "$lucene_passes_exe" ||
            !open (PIPEOUT, "| $full_lucene_passes_exe index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
        }
        $handle = "lucenebuilder::PIPEOUT";
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($store_levels->{'section'}) {
        $gdbm_level = "section";
    }

    # translate the lucene level name back to the key used in 'levels'
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # put the full set of levels back now that this single-level pass is done
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
324
3251;
326
327
Note: See TracBrowser for help on using the repository browser.