source: gsdl/trunk/perllib/lucenebuilder.pm@ 17110

Last change on this file since 17110 was 17110, checked in by kjdon, 16 years ago

changed way cjk separation is done. Not done in plugins any more, but is now an indexoption. cnseg called from filter_text method. generate_index_options sets up the field in buildproc

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26###########################################################################
27# /*
28# * @version 1.0 ?
29# * @version 2.0 Incremental building assistance added. This includes
30# * remove_document_from_database, which overrides the ancestor class's
31# * empty function in order to call lucene_passes.pl, and
32# * full_lucene_passes_exe, so that there is one place in the code that
33# * works out where the perl script is. John Rowe
34# *
35# * @author John Rowe, DL Consulting Ltd.
36# */
37###########################################################################
38
39package lucenebuilder;
40
41# Use the same basic XML structure set up by mgppbuilder/mgppbuildproc
42
43use mgppbuilder;
44use strict; no strict 'refs';
45
46
# Declare mgppbuilder as this package's parent class at compile time.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
50
# /**
#  * Construct a lucenebuilder on top of an mgppbuilder object.
#  *
#  * Every argument after the class name is handed unchanged to the
#  * mgppbuilder constructor; the object it returns is re-blessed into
#  * the requested class. The following keys are then set on the object:
#  *
#  *   buildtype              - always "lucene"
#  *   full_lucene_passes     - path of lucene_passes.pl under
#  *                            $GSDLHOME/bin/script
#  *   full_lucene_passes_exe - command-line prefix (perl + quoted script
#  *                            path) used to run that script
#  *
#  * @return the blessed builder object
#  *
#  * @author John Thompson, DL Consulting Ltd.
#  */
sub new {
    my $class = shift(@_);

    # Call the parent constructor with the direct method-call form. The
    # indirect-object form ("new mgppbuilder (...)") depends on how perl
    # happens to parse the bareword, and this package defines its own new().
    my $self = mgppbuilder->new(@_);
    $self = bless $self, $class;

    $self->{'buildtype'} = "lucene";

    # Do we need to put exe on the end?
    my $exe = &util::get_os_exe ();
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";

    # So where is lucene_passes.pl anyway?
    my $lucene_passes_script = &util::filename_cat($scriptdir, "lucene_passes.pl");

    # Plain script path: used by callers to check that the script exists.
    $self->{'full_lucene_passes'} = "$lucene_passes_script";

    # Tack perl on the beginning to ensure execution. When the OS
    # executable suffix is ".exe" the interpreter is named perl.exe and -S
    # is not passed; otherwise plain "perl -S" is used.
    if ($exe eq ".exe")
    {
        $self->{'full_lucene_passes_exe'} = "perl$exe \"$lucene_passes_script\"";
    }
    else
    {
        $self->{'full_lucene_passes_exe'} = "perl -S \"$lucene_passes_script\"";
    }

    return $self;
}
# /** new() **/
82
# lucene has none of these options: let the parent class generate its
# index options first, then switch casefold, stem, accentfold and
# stemindexes off on this builder.
sub generate_index_options {
    my ($self) = @_;

    $self->SUPER::generate_index_options();

    foreach my $option (qw(casefold stem accentfold stemindexes)) {
        $self->{$option} = 0;
    }

    # The value of the last assignment used to be the implicit return value.
    return 0;
}
94
# Returns the name of the default buildproc for this builder.
sub default_buildproc {
    my ($self) = @_;

    return 'lucenebuildproc';
}
100
# This writes a nice version of the text docs by streaming every document
# through "lucene_passes.pl text".
#
# Returns immediately when $self->{'no_text'} is set. Otherwise the
# buildproc is put into 'text' mode (not indexing text) at document and
# section level, plugin::begin/read/end are run over source_dir, and the
# build statistics are printed.
#
# @param textindex  index specification passed to the buildproc's set_index
#
# In debug mode ($self->{'debug'}) no child process is started and the
# buildproc writes to STDOUT. Otherwise this dies when lucene_passes.pl does
# not exist or the pipe to it cannot be opened.
sub compress_text
{
    my $self = shift (@_);
    # we don't do anything if we don't want compressed text
    return if $self->{'no_text'};

    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};
    print STDERR "Saving the document text\n";
    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i)
    {
        # NOTE(review): $text_dir is not used again after this point, so
        # this substitution has no effect on the command below (which is
        # given $build_dir). Confirm whether $build_dir was intended.
        $text_dir =~ s@/@\\@g;
    }
    else
    {
        if ($outhandle ne "STDERR")
        {
            # so lucene_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # Find the perl script to call to run lucene
    my $full_lucene_passes = $self->{'full_lucene_passes'};
    my $full_lucene_passes_exe = $self->{'full_lucene_passes_exe'};

    my $lucene_passes_sections = "Doc";

    my ($handle);

    if ($self->{'debug'})
    {
        $handle = *STDOUT;
    }
    else
    {
        print STDERR "Full Path:     $full_lucene_passes\n";
        print STDERR "Executable:    $full_lucene_passes_exe\n";
        print STDERR "Sections:      $lucene_passes_sections\n";
        print STDERR "Build Dir:     $build_dir\n";
        print STDERR "Cmd:           $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra\n";
        if (!-e "$full_lucene_passes" ||
            !open($handle, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\"   $osextra"))
        {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            # Report the function that actually failed (the message used
            # to name build_index).
            die "lucenebuilder::compress_text - couldn't run $full_lucene_passes_exe\n";
        }
    }

    # stored text is always Doc and Sec levels
    my $levels = { 'document' => 1, 'section' => 1 };
    # always do database at section level
    my $db_level = "section";

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    $self->{'buildproc'}->set_levels ($levels);
    $self->{'buildproc'}->set_db_level ($db_level);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});
    # STDOUT is left open in debug mode; only the pipe is closed.
    close ($handle) unless $self->{'debug'};
    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
184
# Build every wanted index once per entry in $self->{'levels'} (paragraph
# levels are skipped), then define the final field lists.
#
# @param indexname  optional; when it contains a word character only that
#                   index is considered, otherwise the 'indexes' list of
#                   collect_cfg is used
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    INDEX:
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next INDEX;
        }

        # Remember the plain directory name; the mapping entry is
        # temporarily replaced by a level-prefixed name for each build.
        my $base_dir = $self->{'index_mapping'}->{$index};

        LEVEL:
        foreach my $level (keys %{$self->{'levels'}}) {
            next LEVEL if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($first_letter) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $first_letter.$base_dir;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # put the unprefixed directory name back
        $self->{'index_mapping'}->{$index} = $base_dir;
    }

    #define the final field lists
    $self->make_final_field_list();
}
231
232
# Build one index at one level by streaming the documents through
# "lucene_passes.pl index".
#
# @param index   index specification; colon separated as
#                fields[:subcollections[:languages]], the last two being
#                comma separated lists
# @param llevel  level name handed to lucene_passes.pl (a value of
#                %mgppbuilder::level_map)
#
# In debug mode no child process is started and the buildproc writes to
# STDOUT. Otherwise this dies when lucene_passes.pl does not exist or the
# pipe to it cannot be opened.
sub build_index {
    my ($self, $index, $llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # Find the perl script to call to run lucene
    my $passes_script = $self->{'full_lucene_passes'};
    my $passes_cmd = $self->{'full_lucene_passes_exe'};

    # the section name for lucenepasses is simply the requested level
    my $passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-removeold";

    # get any os specific stuff
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    }
    elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # collect the index expression of every configured subcollection
    # this index belongs to
    my $indexexparr = [];
    my @subcollections = (defined $subcollection) ? split(/,/, $subcollection) : ();
    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    # every entry is passed on as written, including a leading "!" if present
    my @languages = (defined $language) ? split(/,/, $language) : ();
    my $langarr = [ @languages ];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        my $command = "$passes_cmd $opt_create_index index $passes_sections \"$build_dir\" \"$indexdir\"   $osextra";
        print STDERR "Cmd: $command\n";
        if (!-e "$passes_script" || !open($handle, "| $command")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $passes_cmd\n";
        }
    }

    # map the lucene level name back to the key used in $self->{'levels'}
    my $store_levels = $self->{'levels'};
    my $db_level = "section"; #always
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        $dom_level = $key if ($mgppbuilder::level_map{$key} eq $llevel);
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_db_level($db_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # hand the full set of levels back to the buildproc
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
354
# /** A modified version of the basebuilder.pm's function that generates the
#  *  information database from the GA documents. When an incremental build
#  *  has been requested and an information database file already exists,
#  *  the buildproc is run in 'incinfodb' mode instead of rebuilding the
#  *  database; in every other case the work is delegated to
#  *  basebuilder::make_infodatabase.
#  */
sub make_infodatabase
{
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Work out where the info database file lives
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $infodb_path = &dbutil::get_infodb_file_path($self->{'infodbtype'}, $self->{'collection'}, $text_dir);

    # Incremental addition can only occur if an information database
    # already exists. If it doesn't, or if we weren't asked for an
    # incremental build at all, let the super class's function generate it.
    unless ($self->{'incremental'} && -e $infodb_path)
    {
        # Called as a method (not as plain basebuilder::make_infodatabase(@_))
        # so that $self and its data are passed along.
        $self->basebuilder::make_infodatabase(@_);
        return;
    }

    # Carry on with an incremental addition
    print $outhandle "\n*** performing an incremental addition to the info database\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # 1. Init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # 2. Init the buildproc settings.
    #    Note: we still need this to process any associated files - but we
    #    don't expect to pipe anything to the database so we can do away
    #    with the complex output handle.
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($assocdir);
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_mode ('incinfodb'); # Very Important
    $buildproc->set_assocdir ($assocdir);

    # 3. Read in all the metadata from the files in the archives directory
    #    using the GAPlug, with the buildproc as the document processor.
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, "", {}, {}, $buildproc, $self->{'maxdocs'},0, $self->{'gli'});

    print STDERR "</Stage>\n" if $self->{'gli'};
}
404
# /** Lucene specific document removal function. Runs the lucene_passes.pl
#  *  command stored in full_lucene_passes_exe with -remove and the quoted
#  *  document id appended to the command line.
#  *
#  *  @param  oid is the document identifier to be removed.
#  *  @return the output captured from the command
#  *
#  *  @author John Rowe, DL Consulting Ltd.
#  */
sub remove_document_from_database
{
    my $self = shift(@_);
    my ($oid) = @_;

    # The command prefix (perl plus the script path) worked out in new()
    my $passes_cmd = $self->{'full_lucene_passes_exe'};

    # NOTE(review): $oid is interpolated into a shell command line; this
    # assumes identifiers never contain shell metacharacters - confirm.
    return qx{$passes_cmd -remove "$oid"};
}
# /** remove_document_from_database **/
421
422
4231;
424
425
Note: See TracBrowser for help on using the repository browser.