source: main/trunk/greenstone2/perllib/mgbuilder.pm@ 28121

Last change on this file since 28121 was 28121, checked in by ak19, 11 years ago

Better error checking after system commands in mgpbuilder too. Could not use bitwise flags for this, since mg needs to generate all the stem indexes (casefolding, stemming, and combined casefolding-and-stemming), even when one of them (such as casefolding) is not turned on. And if any one of them fails, the original comment seemed to indicate that none of them will work.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32use FileUtils;
33
34
# mgbuilder inherits from basebuilder; the parent is registered at compile
# time.
BEGIN {
    @mgbuilder::ISA = ('basebuilder');
}

# Suffixes of the index files that are kept when build_index cleans up an
# index directory; files with any other suffix are deleted there.
my %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Maximum document size, passed to mg_passes with -b (value comes from
# basebuilder).
my $maxdocsize = $basebuilder::maxdocsize;
52
53
# Constructor. Builds a basebuilder object from the caller's arguments,
# re-blesses it into the requested class and marks its build type as "mg".
sub new {
    my ($class, @args) = @_;

    my $self = bless basebuilder->new(@args), $class;
    $self->{'buildtype'} = "mg";

    return $self;
}
63
# Returns the name of the default build processor for this builder.
sub default_buildproc {
    my ($self) = @_;
    return "mgbuildproc";
}
69
# Normalises the list of indexes in $self->{'collect_cfg'}->{'indexes'}.
#
# If no indexes are configured a single "dummy:text" index is used.  Each
# index then has the "ex." prefix stripped from its fields, but only from
# fields that carry no further metadata prefix (e.g. "ex.Title" becomes
# "Title", while "ex.dc.Title" and "flex.Image" are left alone).  The
# configuration entry is replaced by a new array holding the results.
sub generate_index_list {
    my ($self) = @_;
    my $cfg = $self->{'collect_cfg'} ||= {};

    $cfg->{'indexes'} = [] unless defined $cfg->{'indexes'};
    my $configured = $cfg->{'indexes'};

    # no indexes have been specified so we'll build a "dummy:text" index
    push (@$configured, "dummy:text") unless scalar(@$configured);

    my @cleaned = map {
        my $index = $_;
        # Put a space after every separator so that each field starts
        # either at the beginning of the string or after a space; this is
        # what stops "flex.Image" being mistaken for "ex.Image".
        $index =~ s/(,|:)/$1 /g;
        $index =~ s/(^| )ex\.([^.,:]+)(,|:|$)/$1$2$3/g;
        # take the helper spaces out again
        $index =~ s/(,|:) /$1/g;
        $index;
    } @$configured;

    $cfg->{'indexes'} = \@cleaned;
}
92
# Works out which index options are switched on for this collection.
#
# After calling the parent implementation, sets $self->{'casefold'},
# $self->{'stem'} and $self->{'accentfold'} (always 0: not yet implemented
# for mg).  With no 'indexoptions' in the collection configuration both
# casefolding and stemming default to on; otherwise each is on only if an
# option matching /casefold/ or /stem/ is listed.  The combination is
# recorded for the build cfg in $self->{'stemindexes'}
# (1 = casefold, 2 = stem, 3 = both).
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    my $options = $self->{'collect_cfg'}->{'indexoptions'};
    my ($casefold, $stem) = (0, 0);

    if (defined $options) {
        for my $option (@$options) {
            if ($option =~ /stem/) {
                $stem = 1;
            }
            elsif ($option =~ /casefold/) {
                $casefold = 1;
            }
        }
    }
    else {
        # just use default options
        ($casefold, $stem) = (1, 1);
    }

    $self->{'casefold'}   = $casefold;
    $self->{'stem'}       = $stem;
    $self->{'accentfold'} = 0;    # not yet implemented for mg

    # now we record this for the build cfg
    $self->{'stemindexes'} = ($casefold ? 1 : 0) + ($stem ? 2 : 0);
}
127
# Creates the compressed text for the collection under <build_dir>/text.
#
# The documents are run through the plugins twice:
#   1. piped into "mg_passes -T1" to collect the text statistics,
#   2. after mg_compression_dict has built the compression dictionary,
#      piped into "mg_passes -T2" to compress the text.
# In debug mode no mg program is run; the buildproc output of both passes
# goes to STDOUT instead.
#
# Parameters:
#   $textindex - index specification handed to the buildproc (set_index)
#
# Dies if a required mg executable is missing or an mg_passes pipe cannot
# be opened.  If mg_compression_dict exits non-zero, a warning is printed,
# $self->{'notbuilt'}->{'compressedtext'} is set and the sub returns early.
# When $self->{'gli'} is set, progress is reported on STDERR as XML tags.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    # os specific command line details
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # The tag must be self-closing like every other FatalError in
            # this file, otherwise the GLI stream is not well-formed.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        my $comp_dict_status = system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
        if ($comp_dict_status != 0) {
            print $outhandle "\nmgbuilder::compress_text - Warning: there's no compressed text\n";
            $self->{'notbuilt'}->{'compressedtext'} = 1;
            print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
            return;
        }

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
241
242
# Creates directory names for each of the index descriptions.
#
# Parameters:
#   $indexes - reference to an array of index specifications of the form
#              "level:fields[:subcollection[:languages]]"
#
# Returns a reference to a hash that maps
#   - each full index specification to its (unique) directory name,
#   - each "level:fields", subcollection and language part to its processed
#     short name,
# and that also holds the sub-hashes 'indexmap', 'subcollectionmap' and
# 'languagemap' together with 'indexmaporder', 'subcollectionmaporder' and
# 'languagemaporder', which list the keys in order of first appearance.
#
# An index specification that is listed more than once is mapped only once.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # Index specifications already handled.  A repeated specification would
    # collide with its own directory name and make_unique would find nothing
    # to change, so the uniqueness loop below would never terminate.
    my %already_mapped = ();

    foreach my $index (@$indexes) {
        next if $already_mapped{$index};
        $already_mapped{$index} = 1;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember who owns each name so make_unique can resolve collisions
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = "$level:$gran";
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
313
314
# Resolves a directory-name collision for an index.
#
# Parameters:
#   $namehash - hash of hashes ('index', 'subcollection', 'languages') that
#               records which source value currently owns each short name
#   $index    - the full index specification being named
#   $indexref, $subref, $langref
#             - references to the short names of the index, subcollection
#               and language parts; one of them may be modified
#
# The parts are examined in that order; the first one whose short name is
# owned by a different source value is passed to get_next_version, and the
# remaining parts are left alone.  Returns the concatenation of the three
# (possibly updated) short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @parts = (
        [ 'index',         $indexref, "$level:$gran" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted_owner) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted_owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
329
# Builds one MG index in <build_dir>/<index directory>.
#
# Parameters:
#   $index - index specification "level:fields[:subcollection[:languages]]";
#            its directory name is looked up in $self->{'index_mapping'}
#
# Steps (each reported as a GLI <Phase> on STDERR when $self->{'gli'} is set):
#   1. pipe the documents into mg_passes to create the index dictionary,
#   2. mg_perf_hash_build creates the perfect hash function,
#   3. pipe the documents into mg_passes again to invert the text,
#   4. mg_weights_build creates the weights file,
#   5. mg_invf_dict creates the 'on-disk' stemmed dictionary,
#   6. mg_stem_idx creates the stem indexes,
#   7. files whose suffix is not in %wanted_index_files are deleted.
# In debug mode no mg program is run and the buildproc writes to STDOUT.
#
# Dies if a required mg executable is missing, an mg_passes pipe cannot be
# opened, or the index directory cannot be read.  If the dictionary, hash,
# weights or stemmed-dictionary step fails, $self->{'notbuilt'}->{$index} is
# set, a GLI warning is emitted and the sub returns early.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir,
                                                          $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text",
                                                         $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
        # the text prefix goes on the mg_weights_build command line, so it
        # gets the same treatment (compress_text converts it as well)
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # there may be subcollection info, and language info.
    my (undef, undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    if (defined $subcollection) {
        foreach my $subcol (split /,/, $subcollection) {
            my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subcol};
            push (@$indexexparr, $expression) if defined ($expression);
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" on a language is passed
    # through unchanged.
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    my $langarr = defined ($language) ? [ split /,/, $language ] : [];

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                  "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        # report and close the stage for GLI, as the other early returns do
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }

        my $hash_cmd = "mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);
        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            print $outhandle "mgbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMG passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        my $weights_cmd = "mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        my $weights_status = system ($weights_cmd);
        # check that it worked - if not, don't carry on
        if ($weights_status != 0) {
            print $outhandle "mgbuilder::build_index - No Index: couldn't create weights file, error calling mg_weights_build.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        my $invdict_status = system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");
        # check that it worked - if not, don't carry on
        if ($invdict_status != 0) {
            print $outhandle "mgbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mg_invf_dict.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index}=1;
            return;
        }

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }

        # currently mg won't work if we don't generate all the stem indexes,
        # so we generate them whatever $self->{'casefold'} and
        # $self->{'stem'} say, but don't advertise the fact.
        # Don't bother stemming if casefolding failed, and don't try
        # generating the combined index if stemming failed.
        my @stem_methods = ([1, "Case folding"],
                            [2, "Stemming"],
                            [3, "Casefolding and stemming"]);
        foreach my $stem_method (@stem_methods) {
            my ($method_flag, $label) = @$stem_method;
            my $stem_index_status =
                system ("mg_stem_idx$exe -b 4096 -s$method_flag -f \"$fullindexprefix\" $osextra");
            next if ($stem_index_status == 0);

            print $outhandle "\n$label failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
            last;
        }

        # remove unwanted files
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir)
            or die "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &FileUtils::removeFiles (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
591
# Adds statistics reported by mgstat to the build configuration.
#
# Parameters:
#   $build_cfg - hash reference; 'numwords' and 'numsections' are filled in
#                from the "Words in collection [dict]" and "Documents"
#                lines of the mgstat output
#
# If the mgstat executable is missing or cannot be started, a warning is
# written to $self->{'outhandle'} and $build_cfg is left untouched.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &FileUtils::filenameConcatenate($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &FileUtils::filenameConcatenate("text", $collect_tail);

    # Same shell command line as before, but read through a lexical handle
    # so no package-wide bareword filehandle is clobbered.
    my $mgstat_cmd = "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\"";
    my $pipein;
    if (!-e "$mgstat_exe" || !open ($pipein, '-|', $mgstat_cmd)) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
        return;
    }

    while (defined (my $line = <$pipein>)) {
        if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
            $build_cfg->{'numwords'} = $1;
        } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
            $build_cfg->{'numsections'} = $1;
        }
    }
    close ($pipein);
}
618
6191;
620
621
622
Note: See TracBrowser for help on using the repository browser.