source: main/trunk/greenstone2/perllib/mgbuilder.pm@ 29329

Last change on this file since 29329 was 29329, checked in by ak19, 10 years ago

Dr Bainbridge added SIG PIPE handling to mgbuilder to help him discover when pipes failed in executing cmds that were run with open(). Have yet to test that the SIG PIPE handling code still allows the mgppbuilder.pm and mgbuilder.pm to run on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32use FileUtils;
33
34
# Set up inheritance at compile time: mgbuilder is a specialisation of
# basebuilder, so any method not defined in this file is looked up there.
BEGIN {
    @mgbuilder::ISA = qw(basebuilder);
}
38
39
# Process-wide SIGPIPE handler, installed when this module is loaded.
# Commands in this file are fed through pipes created with open(); if the
# child process goes away, the write raises SIGPIPE.  Turning the signal
# into a die() makes the failure visible instead of silently killing the
# build.
#
# The diagnostic goes to STDERR, not STDOUT: in debug mode STDOUT is the
# handle the document stream is written to, and STDOUT may itself be the
# pipe that just broke.
$SIG{PIPE} = sub {
    print STDERR "got SIGPIPE\n";
    die "$0: Error: $!";
};
44
45
# File suffixes that must be kept in an index directory once the index has
# been built; build_index deletes every file whose suffix is not listed here.
my %wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Maximum document size handed to mg_passes via -b, shared with basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
58
59
# Constructor.
# Builds the object through basebuilder's constructor (all arguments are
# passed straight through), re-blesses it into the requested class and
# records the build type as "mg".
# Returns the new builder object.
sub new {
    my $class = shift(@_);

    # Explicit class-method call; the indirect-object form
    # "new basebuilder (...)" is parsed heuristically and can be
    # misinterpreted by perl.
    my $self = basebuilder->new(@_);
    bless $self, $class;

    $self->{'buildtype'} = "mg";
    return $self;
}
69
# Returns the name of the build processor used by this builder when none
# has been chosen explicitly.
sub default_buildproc {
    my ($self) = @_;
    my $buildproc_name = "mgbuildproc";
    return $buildproc_name;
}
75
# Normalises the list of index specifications in
# $self->{'collect_cfg'}->{'indexes'}.
#  - If no indexes are configured, a single "dummy:text" index is used.
#  - A leading "ex." is stripped from each metadata name, but only when the
#    name carries no further namespace prefix (ex.Title -> Title, while
#    ex.dc.Title and flex.Image are left alone).
# The configured list is replaced by a new array of the cleaned specs.
sub generate_index_list {
    my ($self) = @_;
    my $cfg = $self->{'collect_cfg'};

    $cfg->{'indexes'} = [] unless defined($cfg->{'indexes'});

    # no indexes have been specified so we'll build a "dummy:text" index
    push (@{$cfg->{'indexes'}}, "dummy:text")
	unless scalar(@{$cfg->{'indexes'}});

    my @cleaned = map {
	my $spec = $_;
	# Put a space after every separator so that each field name starts
	# either at the beginning of the string or right after a space.
	# Anchoring "ex." this way keeps names such as flex.Image intact.
	$spec =~ s/(,|:)/$1 /g;
	$spec =~ s/(^| )ex\.([^.,:]+)(,|:|$)/$1$2$3/g;
	# take the helper spaces out again
	$spec =~ s/(,|:) /$1/g;
	$spec;
    } @{$cfg->{'indexes'}};

    $cfg->{'indexes'} = \@cleaned;
}
98
# Works out which index options are enabled for this build.
# After letting the parent class do its part, the 'casefold', 'stem' and
# 'accentfold' flags are reset and then switched on according to the
# collection's 'indexoptions' (an option matching /stem/ enables stemming,
# otherwise one matching /casefold/ enables case folding).
# 'stemindexes' records the result as a bit mask for the build cfg:
# 1 = casefold, 2 = stem.
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    $self->{'casefold'} = 0;
    $self->{'stem'} = 0;
    $self->{'accentfold'} = 0; #not yet implemented for mg

    if (defined($self->{'collect_cfg'}->{'indexoptions'})) {
	for my $opt (@{$self->{'collect_cfg'}->{'indexoptions'}}) {
	    if ($opt =~ /stem/) {
		$self->{'stem'} = 1;
		next;
	    }
	    $self->{'casefold'} = 1 if ($opt =~ /casefold/);
	}
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    $self->{'stemindexes'} += 1 if ($self->{'casefold'});
    $self->{'stemindexes'} += 2 if ($self->{'stem'});
}
128
# Creates the compressed text for the collection.
#
# Parameters:
#   $textindex - the index description handed to the build processor via
#                set_index() for the text passes.
#
# The documents are streamed through the build processor twice:
#   1. into "mg_passes -T1" to collect text statistics, after which
#      mg_compression_dict builds the compression dictionary;
#   2. into "mg_passes -T2" to write the compressed text.
# In debug mode no external program is run; both passes are written to
# STDOUT instead.
#
# Dies if a required executable is missing or a pipe cannot be opened.
# If mg_compression_dict fails, the text is recorded as not built in
# $self->{'notbuilt'}->{'compressedtext'} and the sub returns early.
# When $self->{'gli'} is set, progress tags are written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text",$collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    # native Windows wants backslashes in the path; everywhere else
    # (including cygwin) the tools are given "-d /"
    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
	$fulltextprefix =~ s@/@\\@g;
    } else {
	$osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
	$handle = *STDOUT;
    }
    else {
	my $mgpasses_cmd = "mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra";
	print $outhandle "\ncmd: $mgpasses_cmd\n" if ($self->{'verbosity'} >= 4);

	# The -e test only confirms the executable is installed; the command
	# itself is started by its bare name.
	if (!-e "$mg_passes_exe" || !open($handle, '|-', $mgpasses_cmd)) {
	    # self-closing tag, like every other FatalError emitted in this file
	    print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
	}
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
	$self->{'buildproc'}->set_store_text(0);
    } else {
	$self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
		   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
	print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
	my $compdict_cmd = "mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra";
	print $outhandle "\ncmd: $compdict_cmd\n" if ($self->{'verbosity'} >= 4);
	print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
	if (!-e "$mg_compression_dict_exe") {
	    die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
	}
	my $comp_dict_status = system ($compdict_cmd);
	if ($comp_dict_status != 0) {
	    # not fatal: note that the compressed text is missing and stop here
	    print $outhandle "\nmgbuilder::compress_text - Warning: there's no compressed text\n";
	    $self->{'notbuilt'}->{'compressedtext'} = 1;
	    print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
	    return;
	}

	# -b $maxdocsize sets the maximum document size
	my $mgpasses_cmd = "mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra";
	print $outhandle "\ncmd: $mgpasses_cmd\n" if ($self->{'verbosity'} >= 4);

	if (!-e "$mg_passes_exe" || !open ($handle, '|-', $mgpasses_cmd)) {
	    print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
	}
    }
    else {
	print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    # in debug mode $handle is still STDOUT from the first pass
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
249
250
251# creates directory names for each of the index descriptions
# Creates a directory name for each index description.
#
# Parameters:
#   $indexes - array ref of index descriptions of the form
#              "level:fields[:subcollection[:languages]]".
#
# Returns a hash ref that maps
#   - each full index description to its directory name,
#   - "level:fields", each subcollection and each language spec to its
#     short processed name,
# and that also holds the 'indexmap', 'subcollectionmap' and 'languagemap'
# sub-hashes together with 'indexmaporder', 'subcollectionmaporder' and
# 'languagemaporder', which list the keys in first-seen order.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = (
	'indexmaporder'         => [],
	'subcollectionmaporder' => [],
	'languagemaporder'      => [],
    );

    # %taken_dirs is used to check for collisions. Start this off
    # with the mandatory directory names
    my %taken_dirs = ('text'=>'text', 'extra'=>'extra');
    # short name => original spec, consulted by make_unique
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    for my $index (@$indexes) {
	my ($level, $gran, $subcollection, $languages) = split (":", $index);
	my $level_gran = "$level:$gran";

	# the short index name is the first character of the index level
	# followed by a processed version of the fields, lower-cased
	my ($pindex) = $level =~ /^(.)/;
	$pindex = lc ($pindex . $self->process_field ($gran));

	# processed versions of the subcollection and language, if any
	my $psub  = lc ($self->process_field ($subcollection));
	my $plang = lc ($self->process_field ($languages));

	# make sure the directory name is unique; make_unique changes the
	# three parts through the references it is given
	my $dirname = $pindex . $psub . $plang;
	$dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
	    while (defined ($taken_dirs{$dirname}));
	$mapping{$index} = $dirname;

	# store the mapping orders as well as the maps
	# also put index, subcollection and language fields into the mapping -
	# (the full index name (eg document:text:subcol:lang) is not used on
	# the query page) - these are used for collectionmeta later on
	if (!defined $mapping{'indexmap'}{$level_gran}) {
	    $mapping{'indexmap'}{$level_gran} = $pindex;
	    push (@{$mapping{'indexmaporder'}}, $level_gran);
	    $mapping{$level_gran} = $pindex unless defined $mapping{$level_gran};
	}
	if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
	    $mapping{'subcollectionmap'}{$subcollection} = $psub;
	    push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
	    $mapping{$subcollection} = $psub;
	}
	if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
	    $mapping{'languagemap'}{$languages} = $plang;
	    push (@{$mapping{'languagemaporder'}}, $languages);
	    $mapping{$languages} = $plang;
	}

	# remember what has been used so later indexes can avoid it
	$taken_dirs{$dirname} = $index;
	$pnames{'index'}->{$pindex} = $level_gran;
	$pnames{'subcollection'}->{$psub} = $subcollection;
	$pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
321
322
# Produces a different directory name for an index whose current name
# collides with one already in use.
#
# Parameters:
#   $namehash  - hash ref with 'index', 'subcollection' and 'languages'
#                sub-hashes mapping each short name in use to the spec it
#                was generated from
#   $index     - the full index description "level:fields[:sub[:langs]]"
#   $indexref, $subref, $langref
#              - references to the three short name parts; the part that is
#                changed is updated in place
#
# The first part whose short name is already used for a *different* spec is
# moved on to its next version.  If no part differs (for example the same
# index description is listed twice) the index part is moved on anyway;
# otherwise the unchanged name would be returned and a caller looping until
# the name is unique would never finish.
#
# Returns the concatenation of the three (possibly updated) parts.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # an absent spec and an empty one are the same thing here
    my $or_empty = sub { return defined $_[0] ? $_[0] : ""; };
    my $level_gran = $or_empty->($level) . ":" . $or_empty->($gran);

    if ($or_empty->($namehash->{'index'}->{$$indexref}) ne $level_gran) {
	$self->get_next_version ($indexref);
    } elsif ($or_empty->($namehash->{'subcollection'}->{$$subref}) ne $or_empty->($subcollection)) {
	$self->get_next_version ($subref);
    } elsif ($or_empty->($namehash->{'languages'}->{$$langref}) ne $or_empty->($languages)) {
	$self->get_next_version ($langref);
    } else {
	# nothing distinguishes this index from the one that owns the name;
	# bump the index part so the result still changes
	$self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
337
# Builds one MG index.
#
# Parameters:
#   $index - index description "level:fields[:subcollections[:languages]]";
#            its directory name is looked up in $self->{'index_mapping'}.
#
# Steps (all external tools are skipped in debug mode, where the document
# stream is written to STDOUT instead):
#   1. mg_passes -N1        : index dictionary
#   2. mg_perf_hash_build   : perfect hash function
#   3. mg_passes -N2        : invert the text
#   4. mg_weights_build     : weights file
#   5. mg_invf_dict         : 'on-disk' stemmed dictionary
#   6. mg_stem_idx -s1/2/3  : stem indexes
#   7. delete files whose suffix is not in %wanted_index_files
#
# Dies if a required executable is missing or a pipe cannot be opened.
# When a tool fails, the index is recorded in $self->{'notbuilt'} and the
# sub returns early.  When $self->{'gli'} is set, progress tags are written
# to STDERR; every exit path closes the current stage with </Stage>.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir,
							   $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text",
							  $collect_tail);

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
	&FileUtils::filenameConcatenate($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
	&FileUtils::filenameConcatenate($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
	&FileUtils::filenameConcatenate($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
	&FileUtils::filenameConcatenate($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
	$fullindexprefix =~ s@/@\\@g;
    } else {
	$osextra = " -d /";
	if ($outhandle ne "STDERR") {
	    # so mg_passes doesn't print to stderr if we redirect output
	    $osextra .= " 2>/dev/null";
	}
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # there may be subcollection info, and language info.
    # (the level and fields parts are not needed separately here)
    my (undef, undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcol (@subcollections) {
	if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
	    push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
	}
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
	$languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each language spec is handed on unchanged, including a leading "!".
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
	$handle = *STDOUT;
    }
    else {
	my $mgpasses_cmd = "mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
	    "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra";
	print $outhandle "\ncmd: $mgpasses_cmd\n" if ($self->{'verbosity'} >= 4);

	if (!-e "$mg_passes_exe" || !open($handle, '|-', $mgpasses_cmd)) {
	    print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
	}
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fall over.
    # we check on the .id file - index dictionary
    # NOTE(review): in debug mode mg_passes is not run, so this file will
    # only exist if an earlier build left it behind - confirm that returning
    # here in debug mode is intended.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
	print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
	# tell GLI and close the stage, as the other failure returns do
	print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
	$self->{'notbuilt'}->{$index}=1;
	return;
    }
    if (!$self->{'debug'}) {
	# create the perfect hash function
	if (!-e "$mg_perf_hash_build_exe") {
	    print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
	}

	my $hash_cmd = "mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
	print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);
	my $hash_status = system ($hash_cmd);
	print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
	# check that perf hash was generated - if not, don't carry on
	if ($hash_status != 0) {
	    print $outhandle "mgbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n";
	    print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
	    $self->{'notbuilt'}->{$index}=1;
	    return;
	}

	my $mgpasses_cmd = "mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
	    "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra";
	print $outhandle "\ncmd: $mgpasses_cmd\n" if ($self->{'verbosity'} >= 4);
	if (!-e "$mg_passes_exe" || !open ($handle, '|-', $mgpasses_cmd)) {
	    print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
	}
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
		   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

	# closing the pipe waits for mg_passes; its status is left in $?
	close ($handle);
	my $passes_exit_status = $?;
	print $outhandle "\nMG passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

	# create the weights file
	print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
	print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
	if (!-e "$mg_weights_build_exe") {
	    print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
	}
	my $weights_cmd = "mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra";
	print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
	my $weights_status = system ($weights_cmd);
	# check that it worked - if not, don't carry on
	if ($weights_status != 0) {
	    print $outhandle "mgbuilder::build_index - No Index: couldn't create weights file, error calling mg_weights_build.\n";
	    print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
	    $self->{'notbuilt'}->{$index}=1;
	    return;
	}

	# create 'on-disk' stemmed dictionary
	print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
	my $invdict_cmd = "mg_invf_dict$exe -f \"$fullindexprefix\" $osextra";
	print $outhandle "\ncmd: $invdict_cmd\n" if ($self->{'verbosity'} >= 4);

	print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
	if (!-e "$mg_invf_dict_exe") {
	    print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
	}
	my $invdict_status = system ($invdict_cmd);
	# check that it worked - if not, don't carry on
	if ($invdict_status != 0) {
	    print $outhandle "mgbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mg_invf_dict.\n";
	    print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
	    $self->{'notbuilt'}->{$index}=1;
	    return;
	}

	# creates stem index files for the various stemming methods
	print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
	print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
	if (!-e "$mg_stem_idx_exe") {
	    print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
	    die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
	}

	# currently mg won't work if we don't generate all the stem indexes,
	# so we generate them regardless of the 'casefold' and 'stem'
	# settings, but don't advertise the fact. Don't bother stemming if
	# casefolding failed, and don't try generating indexes for both if
	# stemming failed.
	my $stemindex_cmd = "mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra";
	print $outhandle "\ncmd: $stemindex_cmd\n" if ($self->{'verbosity'} >= 4);

	my $stem_index_status = system ($stemindex_cmd);
	if ($stem_index_status != 0) {
	    print $outhandle "\nCase folding failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
	} else {
	    $stemindex_cmd = "mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra";
	    print $outhandle "\ncmd: $stemindex_cmd\n" if ($self->{'verbosity'} >= 4);
	    $stem_index_status = system ($stemindex_cmd);

	    if ($stem_index_status != 0) {
		print $outhandle "\nStemming failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
	    } else {
		$stemindex_cmd = "mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra";
		print $outhandle "\ncmd: $stemindex_cmd\n" if ($self->{'verbosity'} >= 4);
		$stem_index_status = system ($stemindex_cmd);

		if ($stem_index_status != 0) {
		    print $outhandle "\nCasefolding and stemming failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
		}
	    }
	}

	# remove unwanted files
	my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
	# lexical directory handle: a bareword DIR would be a package global
	opendir (my $dirhandle, $tmpdir)
	    or die "mgbuilder::build_index - couldn't read directory $tmpdir\n";
	foreach my $file (readdir($dirhandle)) {
	    next if $file =~ /^\./;
	    my ($suffix) = $file =~ /\.([^\.]+)$/;
	    if (defined $suffix && !defined $wanted_index_files{$suffix}) {
		# delete it!
		print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
		&FileUtils::removeFiles (&FileUtils::filenameConcatenate($tmpdir, $file));
	    }
	}
	closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
612
# Adds MG-specific statistics to the build configuration.
#
# Parameters:
#   $build_cfg - hash ref; 'numwords' and 'numsections' are filled in from
#                the output of mgstat when the corresponding lines are found.
#
# If mgstat is not installed or cannot be started, a warning is printed to
# the output handle and $build_cfg is left unchanged.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &FileUtils::filenameConcatenate($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &FileUtils::filenameConcatenate("text", $collect_tail);

    my $mgstat_cmd = "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\"";
    my $outhandle = $self->{'outhandle'};
    print $outhandle "\ncmd: $mgstat_cmd\n" if ($self->{'verbosity'} >= 4);

    # lexical handle and explicit read-from-command mode instead of a
    # bareword global handle with the mode embedded in the command string
    my $pipe_in;
    if (!-e "$mgstat_exe" || !open ($pipe_in, '-|', $mgstat_cmd)) {
	print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
	return;
    }

    while (defined (my $line = <$pipe_in>)) {
	if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
	    $build_cfg->{'numwords'} = $1;
	} elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
	    $build_cfg->{'numsections'} = $1;
	}
    }

    # close waits for mgstat to finish; a failure here means it exited
    # with a non-zero status or the pipe could not be closed cleanly
    if (!close ($pipe_in) && $self->{'verbosity'} >= 4) {
	print $outhandle "\nmgstat exit status $?\n";
    }
    return;
}
643
6441;
645
646
647
Note: See TracBrowser for help on using the repository browser.