source: trunk/gsdl/perllib/lucenebuilder.pm@ 10468

Last change on this file since 10468 was 10468, checked in by kjdon, 19 years ago

Made a base builder class, and moved lots of the code to it. Hope I haven't stuffed anything up :-)

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
Line 
1###########################################################################
2#
3# lucenebuilder.pm -- perl wrapper for building index with Lucene
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package lucenebuilder;
27
28# Use same basic XML structure setup by mgppbuilder/mgppbuildproc
29
30use mgppbuilder;
31
# Make lucenebuilder a subclass of mgppbuilder.  Done in a BEGIN block so
# the inheritance chain is in place while the rest of the file compiles.
BEGIN {
    @lucenebuilder::ISA = qw(mgppbuilder);
}
35
36
# Constructor.  Builds an mgppbuilder object from the caller's arguments,
# re-blesses it into the requested class and marks it as a Lucene build.
#
# Arguments (passed straight through to mgppbuilder's constructor):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications, $outhandle, $no_text, $gli
#
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Direct method call rather than "new mgppbuilder (...)": the indirect
    # object form is ambiguous inside a package that defines its own new().
    my $self = mgppbuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "lucene";

    return $self;
}
50
# Name of the build processor used when none is specified explicitly.
# Returns the string "lucenebuildproc".
sub default_buildproc {
    my ($self) = @_;

    my $buildproc_name = "lucenebuildproc";
    return $buildproc_name;
}
56
# Runs the "text" pass: every document is read through the plugins and the
# build processor (in 'text' mode, text storing on, indexing off) and the
# processor's output is piped into "lucene_passes.pl text".  In debug mode
# the output goes to STDOUT instead and lucene_passes.pl is not started.
#
# Arguments:
#   $textindex - index specification handed to the build processor's
#                set_index()
#
# Side effects: creates <build_dir>/text, and sets the 'paragraph' entry of
# $self->{'levels'} to undef.
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be
# opened (a <FatalError> element is printed first when running under GLI).
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "Saving the document text\n";

    # the text directory
    my $text_dir = &util::filename_cat($self->{'build_dir'}, "text");
    my $build_dir = &util::filename_cat($self->{'build_dir'},"");
    &util::mk_all_dir ($text_dir);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # NOTE(review): $text_dir is not read again after this point, so
        # this conversion currently has no effect; build_index converts
        # $build_dir here instead - confirm which one was intended.
        $text_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # locate lucene_passes.pl; on Windows it is run through perl.exe
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    my $full_lucene_passes_exe = $lucene_passes_exe;
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    }
    my $lucene_passes_sections = "Doc";

    # The build processor is given the *name* of the handle to print to.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$lucene_passes_exe" ||
            !open (PIPEOUT, "| $full_lucene_passes_exe text $lucene_passes_sections \"$build_dir\" \"dummy\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::compress_text - couldn't run $lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($levels->{'section'}) {
        $gdbm_level = "section";
    }

    undef $levels->{'paragraph'}; # get rid of para if we had it.

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($levels);
    $buildproc->set_gdbm_level ($gdbm_level);
    $buildproc->reset();

    # run every document through the plugins and the processor
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child; a false return means the
        # write failed ($! set) or the child exited non-zero ($? set).
        # Report it but carry on, as the build did before.
        if (!close ($handle)) {
            my $reason = $! ? "error: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: lucene_passes.pl text pass did not finish cleanly ($reason)\n";
        }
    }

    $self->print_stats();

    print STDERR "</Stage>\n" if $self->{'gli'};
}
131
# Builds every wanted index at every configured level except paragraph.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is built, otherwise all of
#                $self->{'collect_cfg'}->{'indexes'} are considered
#
# For each index/level pair the entry in $self->{'index_mapping'} is
# temporarily set to the first letter of the level followed by the index's
# mapped directory name, build_index() is called, and the original mapping
# is put back once all levels of that index are done.  Finishes by calling
# make_final_field_list().
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names (includes subcolls and langs)
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        my $idx = $self->{'index_mapping'}->{$index};
        foreach my $level (keys %{$self->{'levels'}}) {
            next if $level =~ /paragraph/; # we don't do para indexing

            # should probably check that new name with level
            # is unique ... but currently (with doc sec and para)
            # each has unique first letter.
            my ($pindex) = $level =~ /^(.)/;
            $self->{'index_mapping'}->{$index} = $pindex.$idx;

            my $llevel = $mgppbuilder::level_map{$level};
            print $outhandle "\n*** building index $index at level $llevel in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            print STDERR "<Stage name='Index' source='$index' level=$llevel>\n" if $self->{'gli'};

            $self->build_index($index,$llevel);
        }

        # restore the level-independent directory name
        $self->{'index_mapping'}->{$index} = $idx;
    }

    #define the final field lists
    $self->make_final_field_list();
}
178
# Builds one index at one level: every document is read through the
# plugins and the build processor (text mode, indexing and text storing
# both on) and the processor's output is piped into
# "lucene_passes.pl index".  In debug mode the output goes to STDOUT
# instead and lucene_passes.pl is not started.
#
# Arguments:
#   $index  - index specification "fields[:subcollections[:languages]]",
#             where subcollections and languages are comma-separated lists
#   $llevel - level tag passed to lucene_passes.pl; it is also matched
#             against the values of %mgppbuilder::level_map to choose the
#             single level the build processor works on
#
# The index is created from scratch ("-create") unless $self->{'keepold'}
# is set.  The build processor's levels are restored to $self->{'levels'}
# before returning.
#
# Dies if lucene_passes.pl does not exist or the pipe to it cannot be
# opened (a <FatalError> element is printed first when running under GLI).
sub build_index {
    my $self = shift (@_);
    my ($index,$llevel) = @_;
    my $outhandle = $self->{'outhandle'};
    my $build_dir = $self->{'build_dir'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($build_dir, $indexdir));

    # locate lucene_passes.pl; on Windows it is run through perl.exe
    my $scriptdir = "$ENV{'GSDLHOME'}/bin/script";
    my $lucene_passes_exe = &util::filename_cat($scriptdir, "lucene_passes.pl");
    my $full_lucene_passes_exe = $lucene_passes_exe;
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $full_lucene_passes_exe = "perl.exe -S \"$lucene_passes_exe\"";
    }

    # define the section names and possibly the doc name for lucenepasses
    my $lucene_passes_sections = $llevel;

    my $opt_create_index = ($self->{'keepold'}) ? "" : "-create";

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $build_dir =~ s@/@\\@g;
    } elsif ($outhandle ne "STDERR") {
        # so lucene_passes doesn't print to stderr if we redirect output
        $osextra .= " 2>/dev/null";
    }

    # there may be subcollection info, and language info.
    my ($fields, $subcollection, $language) = split (":", $index);

    # collect the expressions of the subcollections this index belongs to;
    # names with no definition in the collection config are skipped
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcoll (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcoll};
        push (@$indexexparr, $expression) if (defined ($expression));
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" on an entry marks it as
    # negated; entries are handed on exactly as written.
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    push (@$langarr, split (/,/, $language)) if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary (lucene_passes -I1)\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};

    # The build processor is given the *name* of the handle to print to.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$lucene_passes_exe" ||
            !open (PIPEOUT, "| $full_lucene_passes_exe $opt_create_index index $lucene_passes_sections \"$build_dir\" \"$indexdir\" $osextra")) {
            print STDERR "<FatalError name='NoRunLucenePasses'/>\n</Stage>\n" if $self->{'gli'};
            die "lucenebuilder::build_index - couldn't run $lucene_passes_exe\n";
        }
        $handle = 'lucenebuilder::PIPEOUT';
    }

    my $store_levels = $self->{'levels'};
    my $gdbm_level = "document";
    if ($store_levels->{'section'}) {
        $gdbm_level = "section";
    }

    # map the level tag back to the configured level name it stands for
    my $dom_level = "";
    foreach my $key (keys %$store_levels) {
        if ($mgppbuilder::level_map{$key} eq $llevel) {
            $dom_level = $key;
        }
    }
    if ($dom_level eq "") {
        print STDERR "Warning: unrecognized tag level $llevel\n";
        $dom_level = "document";
    }

    my $local_levels = { $dom_level => 1 }; # work on one level at a time

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);
    $buildproc->set_indexfieldmap ($self->{'indexfieldmap'});
    $buildproc->set_levels ($local_levels);
    $buildproc->set_gdbm_level($gdbm_level);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'}, 0, $self->{'gli'});

    if (!$self->{'debug'}) {
        # Closing a pipe waits for the child; a false return means the
        # write failed ($! set) or the child exited non-zero ($? set).
        # Report it but carry on, as the build did before.
        if (!close ($handle)) {
            my $reason = $! ? "error: $!" : "exit status " . ($? >> 8);
            print STDERR "Warning: lucene_passes.pl index pass for $index did not finish cleanly ($reason)\n";
        }
    }

    $self->print_stats();

    # put the full set of levels back for whoever uses the processor next
    $buildproc->set_levels ($store_levels);
    print STDERR "</Stage>\n" if $self->{'gli'};
}
309
3101;
311
312
Note: See TracBrowser for help on using the repository browser.