source: main/trunk/greenstone2/perllib/mgbuilder.pm@ 28375

Last change on this file since 28375 was 28375, checked in by davidb, 11 years ago

A set of changes to help the Greenstone building code (Perl) run under Cygwin. The test is designed to be mutually exclusive with the case of running natively on Windows. In effect, the refined test says: if you are on Windows but not under Cygwin, do as you used to do for Windows; otherwise go with Unix (as Cygwin effectively gives you a Unix-like operating system to run in).

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use basebuilder;
29use plugin;
30use strict; no strict 'refs';
31use util;
32use FileUtils;
33
34
35BEGIN {
36 @mgbuilder::ISA = ('basebuilder');
37}
38
39
# File suffixes that build_index keeps in an index directory once the
# index has been built; files with any other suffix are deleted there.
my %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);

# Maximum document size handed to mg_passes (-b), taken from basebuilder.
my $maxdocsize = $basebuilder::maxdocsize;
52
53
# Constructor. Creates a basebuilder object from the supplied arguments,
# re-blesses it into the requested class and marks the build type as "mg".
sub new {
    my ($class, @args) = @_;

    my $self = basebuilder->new(@args);
    bless $self, $class;

    $self->{'buildtype'} = "mg";
    return $self;
}
63
# Returns the name of the build processor used when none is specified.
sub default_buildproc {
    my ($self) = @_;

    return "mgbuildproc";
}
69
# Normalises the list of indexes held in $self->{'collect_cfg'}->{'indexes'}.
#
# If no indexes are configured, a single "dummy:text" index is used.
# A leading "ex." is then stripped from each metadata field name, but only
# when the field has no further metadata prefix (ex.Title becomes Title,
# while ex.dc.Title and flex.Image are left untouched).
# The configured list is replaced by a fresh array of the cleaned entries.
sub generate_index_list {
    my ($self) = @_;

    $self->{'collect_cfg'}->{'indexes'} = []
        unless defined($self->{'collect_cfg'}->{'indexes'});
    my $cfg = $self->{'collect_cfg'};

    # no indexes have been specified so we'll build a "dummy:text" index
    push (@{$cfg->{'indexes'}}, "dummy:text")
        unless scalar(@{$cfg->{'indexes'}});

    my @requested = @{$cfg->{'indexes'}};
    my $cleaned = $cfg->{'indexes'} = [];

    for my $index (@requested) {
        # Pad every separator with a space so that "ex." can be recognised
        # only at the start of a field name (this keeps flex.Image intact),
        # strip the prefix, then remove the padding again.
        $index =~ s/(,|:)/$1 /g;
        $index =~ s/(^| )ex\.([^.,:]+)(,|:|$)/$1$2$3/g;
        $index =~ s/(,|:) /$1/g;

        push (@$cleaned, $index);
    }
}
92
# Works out which index options apply to this build, after letting the
# parent class do its own option handling.
#
# Sets $self->{'casefold'}, $self->{'stem'} and $self->{'accentfold'}
# from the collection's 'indexoptions', and records the combination in
# $self->{'stemindexes'} (1 for casefold, plus 2 for stem) for the
# build cfg.
sub generate_index_options {
    my ($self) = @_;
    $self->SUPER::generate_index_options();

    # accentfold stays 0: not yet implemented for mg
    @{$self}{'casefold', 'stem', 'accentfold'} = (0, 0, 0);

    my $configured = $self->{'collect_cfg'}->{'indexoptions'};
    if (defined($configured)) {
        for my $option (@$configured) {
            # an option mentioning "stem" is never also treated as casefold
            if    ($option =~ /stem/)     { $self->{'stem'} = 1; }
            elsif ($option =~ /casefold/) { $self->{'casefold'} = 1; }
        }
    }

    # now we record this for the build cfg
    $self->{'stemindexes'} = 0;
    $self->{'stemindexes'} += 1 if $self->{'casefold'};
    $self->{'stemindexes'} += 2 if $self->{'stem'};
}
122
# Creates the compressed text for the collection.
#
# The documents are fed through mg_passes twice by the build processor:
# once (-T1) to collect text statistics, and once (-T2) to compress the
# text, with mg_compression_dict run in between to build the compression
# dictionary. In debug mode no MG program is run; the build processor's
# output goes to STDOUT instead.
#
# Parameters:
#   $textindex - index description passed to the build processor's
#                set_index.
#
# Dies if a required MG executable is missing or cannot be started.
# If mg_compression_dict fails, records
# $self->{'notbuilt'}->{'compressedtext'} and returns early.
# When $self->{'gli'} is set, progress markers are written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # The executables are checked for existence under $exedir, but are
    # invoked below by bare name.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &FileUtils::filenameConcatenate($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    my $maxnumeric = $self->{'maxnumeric'};

    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, "text"));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $basefilename = &FileUtils::filenameConcatenate("text", $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $basefilename);

    # Native Windows (not Cygwin) wants backslashes in the file prefix;
    # everything else is treated as Unix and is given "-d /".
    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # The FatalError element is self-closed, like every other
            # GLI marker emitted by this module.
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        my $comp_dict_status = system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
        if ($comp_dict_status != 0) {
            # Not fatal: note that the compressed text was not built and stop.
            print $outhandle "\nmgbuilder::compress_text - Warning: there's no compressed text\n";
            $self->{'notbuilt'}->{'compressedtext'} = 1;
            print STDERR "<Warning name='NoCompressedText'/>\n</Stage>\n" if $self->{'gli'};
            return;
        }

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
236
237
# Creates a directory name for each of the index descriptions.
#
# Parameters:
#   $indexes - reference to a list of index descriptions of the form
#              level:fields[:subcollection[:languages]]
#
# Returns a reference to a hash that maps each full index description to
# its directory name. The same hash also holds the 'indexmap',
# 'subcollectionmap' and 'languagemap' sub-hashes, the matching
# '...maporder' lists (in order of first appearance), and direct entries
# for each "level:fields", subcollection and language key.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = (
        'indexmaporder'         => [],
        'subcollectionmaporder' => [],
        'languagemaporder'      => [],
    );

    # %dirnames is used to check for collisions. It starts off with the
    # mandatory directory names.
    my %dirnames = ('text' => 'text', 'extra' => 'extra');
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    for my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);
        my $index_key = "$level:$gran";

        # the directory name starts with the first character of the index
        # level, followed by a processed version of the index fields
        my ($pindex) = $level =~ /^(.)/;
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # then processed versions of the subcollection and the language,
        # if there are any
        my $psub  = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep generating new names until this one collides with nothing
        my $dirname = $pindex . $psub . $plang;
        $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang)
            while defined ($dirnames{$dirname});

        $mapping{$index} = $dirname;

        # Store the mapping orders as well as the maps. The index,
        # subcollection and language parts are also put into the mapping
        # on their own (the full index name, eg document:text:subcol:lang,
        # is not used on the query page); these are used for
        # collectionmeta later on.
        unless (defined $mapping{'indexmap'}{$index_key}) {
            $mapping{'indexmap'}{$index_key} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $index_key);
            $mapping{$index_key} = $pindex unless defined $mapping{$index_key};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been handed out, for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $index_key;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
308
309
# Resolves a directory name collision for an index description.
#
# Parameters:
#   $namehash - hash of the processed names handed out so far, keyed by
#               'index', 'subcollection' and 'languages'
#   $index    - the full index description (level:fields:subcol:langs)
#   $indexref, $subref, $langref - references to the processed index,
#               subcollection and language names
#
# The first of the three processed names that is already registered for
# something else is advanced through get_next_version (at most one name
# per call). Returns the three names joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # the tests are tried in order and stop at the first mismatch
    my $clashing =
          $namehash->{'index'}->{$$indexref} ne "$level:$gran"       ? $indexref
        : $namehash->{'subcollection'}->{$$subref} ne $subcollection ? $subref
        : $namehash->{'languages'}->{$$langref} ne $languages        ? $langref
        :                                                              undef;

    $self->get_next_version ($clashing) if defined $clashing;

    return join ('', $$indexref, $$subref, $$langref);
}
324
# Builds a single MG index.
#
# Parameters:
#   $index - index description of the form
#            level:fields[:subcollection[:languages]]; its directory name
#            is looked up in $self->{'index_mapping'}.
#
# Steps: a first mg_passes run (-N1) creates the index dictionary,
# mg_perf_hash_build creates the perfect hash function, a second mg_passes
# run (-N2) inverts the text, then mg_weights_build, mg_invf_dict and
# mg_stem_idx are run and unwanted files are removed from the index
# directory. In debug mode the build processor's output goes to STDOUT
# instead of to mg_passes.
#
# Dies if a required MG executable is missing or cannot be started. If a
# step fails in a way that only affects this index, the index is recorded
# in $self->{'notbuilt'} and the sub returns early. Every exit path that
# does not die emits "</Stage>" on STDERR when $self->{'gli'} is set.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &FileUtils::makeAllDirectories (&FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir));

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $fullindexprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir,
                                                          $collect_tail);
    my $fulltextprefix = &FileUtils::filenameConcatenate($self->{'build_dir'}, "text",
                                                         $collect_tail);

    # get any os specific stuff
    # (the executables are checked for existence under $exedir, but are
    # invoked below by bare name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &FileUtils::filenameConcatenate($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &FileUtils::filenameConcatenate($exedir, "mg_stem_idx$exe");

    my $maxnumeric = $self->{'maxnumeric'};

    # Native Windows (not Cygwin) wants backslashes in the index prefix;
    # everything else is treated as Unix and is given "-d /".
    my $osextra = "";
    if (($ENV{'GSDLOS'} =~ /^windows$/i) && ($^O ne "cygwin")) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # there may be subcollection info, and language info.
    my (undef, undef, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my $languagemetadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'languagemetadata'})) {
        $languagemetadata = $self->{'collect_cfg'}->{'languagemetadata'};
    }
    # Each entry is passed on exactly as written, including any leading
    # "!" that negates the language.
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = *STDOUT;
    }
    else {
        if (!-e "$mg_passes_exe" ||
            !open($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                  "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($languagemetadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # Now we check to see if the required files have been produced - if not
    # we stop building this index so that the whole process doesn't fail.
    # We check on the .id file - the index dictionary.
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        # Close the stage for GLI, as the other early returns below do.
        print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
        $self->{'notbuilt'}->{$index} = 1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }

        my $hash_cmd = "mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra";
        print $outhandle "\ncmd: $hash_cmd\n" if ($self->{'verbosity'} >= 4);
        my $hash_status = system ($hash_cmd);
        print $outhandle "\nstatus from running hash_cmd: $hash_status\n" if ($self->{'verbosity'} >= 4);
        # check that perf hash was generated - if not, don't carry on
        if ($hash_status != 0) {
            print $outhandle "mgbuilder::build_index - Couldn't create index $index as there are too few words in the index.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index} = 1;
            return;
        }

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->reset();

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);
        my $passes_exit_status = $?;
        print $outhandle "\nMG passes exit status $passes_exit_status\n" if ($self->{'verbosity'} >= 4);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        my $weights_cmd = "mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra";
        print $outhandle "\ncmd: $weights_cmd\n" if ($self->{'verbosity'} >= 4);
        my $weights_status = system ($weights_cmd);
        # check that it worked - if not, don't carry on
        if ($weights_status != 0) {
            print $outhandle "mgbuilder::build_index - No Index: couldn't create weights file, error calling mg_weights_build.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index} = 1;
            return;
        }

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        my $invdict_status = system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");
        # check that it worked - if not, don't carry on
        if ($invdict_status != 0) {
            print $outhandle "mgbuilder::build_index - No Index: couldn't create on-disk stemmed dictionary, error calling mg_invf_dict.\n";
            print STDERR "<Warning name='NoIndex'/>\n</Stage>\n" if $self->{'gli'};
            $self->{'notbuilt'}->{$index} = 1;
            return;
        }

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }

        # Currently mg won't work if we don't generate all the stem indexes,
        # so we generate them whatever the casefold/stem options say, but
        # don't advertise the fact. We don't bother stemming if casefolding
        # failed, and don't try generating indexes for both if stemming failed.
        my $stem_index_status = system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        if ($stem_index_status != 0) {
            print $outhandle "\nCase folding failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
        } else {
            $stem_index_status = system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");

            if ($stem_index_status != 0) {
                print $outhandle "\nStemming failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
            } else {
                $stem_index_status = system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

                if ($stem_index_status != 0) {
                    print $outhandle "\nCasefolding and stemming failed: mg_stem_idx exit status $stem_index_status\n" if ($self->{'verbosity'} >= 4);
                }
            }
        }

        # remove unwanted files: anything in the index directory whose
        # suffix is not listed in %wanted_index_files
        my $tmpdir = &FileUtils::filenameConcatenate($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &FileUtils::removeFiles (&FileUtils::filenameConcatenate($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
586
# Adds statistics reported by mgstat to the build configuration.
#
# Parameters:
#   $build_cfg - hash reference; 'numwords' and 'numsections' are set from
#                the "Words in collection [dict]" and "Documents" lines of
#                mgstat's output, when those lines are present.
#
# If mgstat is missing or cannot be started, a warning is printed to
# $self->{'outhandle'} and $build_cfg is left untouched.
sub build_cfg_extra {
    my $self = shift(@_);
    my ($build_cfg) = @_;

    # get additional stats from mg
    # (mgstat is checked for existence under $exedir, but is invoked below
    # by bare name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &FileUtils::filenameConcatenate($exedir, "mgstat$exe");

    my $collect_tail = &util::get_dirsep_tail($self->{'collection'});
    my $input_file = &FileUtils::filenameConcatenate("text", $collect_tail);

    # A lexical handle is used so that no package-global filehandle is
    # shared with other code.
    my $pipe_in;
    if (!-e "$mgstat_exe" || !open ($pipe_in, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        while (defined (my $line = <$pipe_in>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close ($pipe_in);
    }
}
613
6141;
615
616
617
Note: See TracBrowser for help on using the repository browser.