source: trunk/gsdl/perllib/mgbuilder.pm@ 1424

Last change on this file since 1424 was 1424, checked in by sjboddie, 24 years ago

Added a -out option to most of the perl building scripts to allow output
debug information to be directed to a file.

  • Property svn:keywords set to Author Date Id Revision
File size: 24.9 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# While this module is loaded STDOUT and STDERR are autoflushed so that
# the output of the mg programs doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# autoflush is switched off again when the program exits
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# value given to the -b option of mg_passes
$maxdocsize = 12000;

# suffixes of the files kept in an index directory; build_index deletes
# files with any other suffix once an index has been built
%wanted_index_files = map { ($_ => 1) }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Creates an mgbuilder object for a collection.
#
# Arguments: $class, followed by
#   $collection         - collection name (used to look for a collection
#                         specific buildproc and stored in the object)
#   $source_dir         - source directory handed to the plugins
#   $build_dir          - directory the build is written to
#   $verbosity          - verbosity level for progress messages
#   $maxdocs            - stored and later handed to plugin::begin/read
#   $debug              - stored debug flag
#   $keepold            - stored keepold flag
#   $allclassifications - stored and later handed to
#                         classify::output_classify_info
#   $outhandle          - where messages are printed (default STDERR)
#
# collect.cfg is read from $GSDLCOLLECTDIR/etc, the list of indexes is
# expanded for subcollections and languages, and the plugins, classifiers
# and build document processor are loaded.
#
# Dies if collect.cfg is missing, if no plugins were loaded or if the
# build document processor can't be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    # messages go to STDERR unless the caller supplied a handle
    $outhandle = 'STDERR' unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                # index descriptions are split on ":" into level, fields,
                # subcollection and language. If the description has no
                # subcollection yet, an empty one is added so that the
                # language isn't read back as a subcollection.
                my @parts = split (":", $index);
                push (@parts, "") while (scalar (@parts) < 3);
                push (@{$cfg->{'indexes'}}, join (":", @parts, $language));
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is held in a variable, so the constructor is called
    # as a class method on that name (no string eval is needed for this)
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
159
# Prepares the build directory. Unless the debug or keepold flag is set,
# any old build is removed and an empty build directory containing a
# "text" subdirectory is created in its place.
sub init {
    my ($self) = @_;

    my $fresh_build = !$self->{'debug'} && !$self->{'keepold'};
    if ($fresh_build) {
        my $builddir = $self->{'build_dir'};

        # throw away whatever was built before and start again
        &util::rm_r ($builddir);
        &util::mk_all_dir ($builddir);

        # the text directory sits directly below the build directory
        &util::mk_all_dir ("$builddir/text");
    }
}
173
# Creates the compressed text of the collection.
#
# The documents are run through the plugins twice. The first run is piped
# to "mg_passes -T1" to collect the text statistics, mg_compression_dict
# then creates the compression dictionary, and the second run is piped to
# "mg_passes -T2" to compress the text. If the debug flag is set nothing
# is run and the plugin output goes to STDOUT instead.
#
# Arguments: $textindex - the index handed to the buildproc's set_index
#
# Dies if one of the mg programs is missing or a pipe can't be opened.
# A program that finishes with a non-zero status is reported on the
# output handle.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # the output handle is referred to by name: STDOUT when debugging,
    # otherwise the pipe to mg_passes
    my $handle = $debug ? 'STDOUT' : 'mgbuilder::PIPEOUT';

    # opens the pipe to mg_passes for the given text pass (1 or 2)
    # -b $maxdocsize sets the maximum document size
    my $open_pass = sub {
        my ($pass) = @_;
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T$pass $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    };

    # closes the pipe; close returns false when the program at the other
    # end finished with a non-zero status, which used to go unnoticed
    my $close_pass = sub {
        my ($pass) = @_;
        close ($handle) or print $outhandle
            "mgbuilder::compress_text - warning: $mg_passes_exe -T$pass " .
            "did not finish cleanly (status $?)\n";
    };

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    print $outhandle "\n    collecting text statistics\n" if $verbose;

    $open_pass->(1) unless $debug;

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    $close_pass->(1) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$debug) {
        print $outhandle "\n    creating the compression dictionary\n" if $verbose;
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        my $status = system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");
        print $outhandle "mgbuilder::compress_text - warning: " .
            "$mg_compression_dict_exe returned a non-zero status ($status)\n"
            if ($status != 0);

        $open_pass->(2);
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n    compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    $close_pass->(2) unless $debug;

    $self->print_stats();
}
254
# Says whether an index should be built. Each entry of the "dontbuild"
# list in the collection configuration is used as a pattern that has to
# match the whole index description. For an index that matches, the
# directory name it maps to is added to the object's "notbuilt" list.
#
# Arguments: $index - the index description
# Returns 0 if the index matched a dontbuild entry and 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that is being left out
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
270
# Builds the indexes of the collection.
#
# Arguments: $indexname - optional; if it contains a word character only
#                         this index is built, otherwise every index
#                         listed in the collection configuration is
#
# The mapping between index descriptions and directory names is created
# and stored in the object first. Indexes that want_built rejects are
# skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # either the one index that was asked for or all configured indexes
    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index ($index);
    }
}
298
# Creates directory names for each of the index descriptions.
#
# Arguments: $indexes - reference to a list of index descriptions of the
#                       form level:fields[:subcollection[:languages]]
#
# Returns a reference to a hash which maps every index description to its
# directory name and which also contains
#   indexmap, subcollectionmap, languagemap - the processed name of each
#       "level:fields", subcollection and language
#   indexmaporder, subcollectionmaporder, languagemaporder - the keys of
#       those maps in the order they were first seen
#
# Dies if no unique directory name can be found for an index.
sub create_index_mapping {
    my ($self, $indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # which index/subcollection/language each processed name was used
    # for; every kind of name needs a hash of its own
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # index descriptions that already have a directory name
    my %done = ();

    foreach my $index (@$indexes) {
        # an index that is listed twice keeps the name it was given the
        # first time (looking for a new name for it would never finish)
        next if defined $done{$index};

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $newname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);

            # make_unique is called with the same arguments every time,
            # so a name that didn't change is never going to change
            if ($newname eq $dirname) {
                die "mgbuilder::create_index_mapping - couldn't create a " .
                    "unique directory name for index $index\n";
            }
            $dirname = $newname;
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $done{$index} = 1;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
362
# Returns a processed version of a field.
# If the field has only one component the processed version will contain
# the first character and the next consonant of that component (or the
# first two characters if there is no later consonant, or just the one
# character if that is all there is) - otherwise it will contain the
# first character of the first two components.
#
# Arguments: $field - comma separated list of components, may be undefined
# Returns "" for a field that is undefined or has no word characters.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # first character of each of the first two components
        my @initials = map { /^(.)/ ? $1 : "" } @components[0, 1];
        return join ("", @initials);
    }

    # first character and the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or failing that the first two characters ...
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # ... or the only character of a one character field
    ($first) = $field =~ /^(.)/ unless defined $first;

    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}
385
# Changes one part of a directory name that is already in use.
#
# Arguments:
#   $namehash - reference to a hash with the keys index, subcollection
#               and languages; each holds a hash from processed names to
#               what they were used for
#   $index    - the index description (split on ":" into level, fields,
#               subcollection and languages)
#   $indexref, $subref, $langref - references to the processed index,
#               subcollection and language names
#
# The first of the three processed names that $namehash records for
# something other than this index's own value is replaced by its next
# version (see get_next_version); at most one name is changed per call.
# Returns the three processed names joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # kind of name, the processed name and what this index needs it for,
    # in the order in which they are checked
    my @candidates = (['index', $indexref, "$level:$gran"],
                      ['subcollection', $subref, $subcollection],
                      ['languages', $langref, $languages]);

    foreach my $candidate (@candidates) {
        my ($kind, $nameref, $wanted) = @$candidate;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
400
# Replaces a name by its next version, in place.
#
# Arguments: $nameref - reference to the name
#
# A name that doesn't end in a digit has its last character replaced by
# 0, a name ending in digits has that number increased by one. Going from
# 9 to 10 uses up the character in front of the 9 as well (dt9 becomes
# d10), and an empty name becomes 0.
sub get_next_version {
    my ($self, $nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # there is only one digit at the end of the name here, so
            # the 9 and, if there is one, the character in front of it
            # are replaced
            $$nameref =~ s/.?\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } elsif ($$nameref eq "") {
        # there is no last character to replace
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }

    return;
}
416
# Builds one index.
#
# Arguments: $index - index description of the form
#                     level:fields[:subcollection[:languages]]; the
#                     directory it maps to is looked up in the object's
#                     index_mapping
#
# The documents are run through the plugins twice, each time piped to
# mg_passes (-N1 for the index dictionary, -N2 to invert the text), with
# mg_perf_hash_build run in between. Afterwards mg_weights_build,
# mg_invf_dict and mg_stem_idx are run and every file in the index
# directory whose suffix isn't in %wanted_index_files is deleted. If the
# debug flag is set none of the programs are run and the plugin output
# goes to STDOUT.
#
# Dies if one of the mg programs is missing, a pipe can't be opened or
# the index directory can't be read. A program that finishes with a
# non-zero status is reported on the output handle.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection, $languages) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection. The languages come from the index
    # description; a description without a language field gets every
    # language of the collection configuration.
    my @languages = ();
    if (defined $languages && $languages =~ /\w/) {
        @languages = split /,/, $languages;
    } elsif (defined $self->{'collect_cfg'}->{'languages'}) {
        @languages = @{$self->{'collect_cfg'}->{'languages'}};
    }

    # @languages holds copies, so taking the "!" off a language here
    # leaves the configured list as it is for the next index
    foreach my $language (@languages) {
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # the output handle is referred to by name: STDOUT when debugging,
    # otherwise the pipe to mg_passes
    my $handle = $debug ? 'STDOUT' : 'mgbuilder::PIPEOUT';

    # opens the pipe to mg_passes with the options of one of the passes
    my $open_pass = sub {
        my ($pass_options) = @_;
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level $pass_options $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    };

    # closes the pipe; close returns false when the program at the other
    # end finished with a non-zero status
    my $close_pass = sub {
        close ($handle) or print $outhandle
            "mgbuilder::build_index - warning: $mg_passes_exe " .
            "did not finish cleanly (status $?)\n";
    };

    # runs one of the other mg programs and reports a non-zero status
    my $run_tool = sub {
        my ($tool_exe, $options) = @_;
        if (!-e "$tool_exe") {
            die "mgbuilder::build_index - couldn't run $tool_exe\n";
        }
        my $status = system ("$tool_exe $options");
        print $outhandle "mgbuilder::build_index - warning: $tool_exe " .
            "returned a non-zero status ($status)\n" if ($status != 0);
    };

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if $verbose;
    $open_pass->("-m 32 -s 0 -G -t 10 -N1") unless $debug;

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    $close_pass->() unless $debug;

    $self->print_stats();

    if (!$debug) {
        # create the perfect hash function
        $run_tool->($mg_perf_hash_build_exe, "-f $fullindexprefix $osextra");

        $open_pass->("-c 3 -G -t 10 -N2");
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if $verbose;

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debug) {

        $close_pass->();

        # create the weights file
        print $outhandle "\n    create the weights file\n" if $verbose;
        $run_tool->($mg_weights_build_exe,
                    "-f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if $verbose;
        $run_tool->($mg_invf_dict_exe, "-f $fullindexprefix $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if $verbose;
        foreach my $stem_method (1, 2, 3) {
            $run_tool->($mg_stem_idx_exe,
                        "-b 4096 -s$stem_method -f $fullindexprefix $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
574
# Creates the info database and processes the associated files.
#
# The "text" and "assoc" directories are created below the build
# directory, the classifiers are initialised and the buildproc is put
# into 'infodb' mode. The collection metadata of the collection
# configuration (if there is any), the output of the plugins and the
# classification information are then written to a pipe to txt2db, or to
# STDOUT if the debug flag is set.
#
# Dies if txt2db is missing or the pipe to it can't be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    foreach my $dir ($textdir, $assocdir) {
        &util::mk_all_dir ($dir);
    }

    # get db name; the extension depends on the byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print $outhandle "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # work out where the output goes (handles are referred to by name)
    my $handle = 'STDOUT';
    if (!$self->{'debug'}) {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        # the index mapping is needed for the metadata of the indexes
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            if (substr ($key, 0, 1) ne ".") {
                print $handle "<$key>$collectionmeta->{$key}\n";
                next;
            }

            # a key starting with "." names an index; its metadata is
            # stored under the directory name of that index
            my $cmeta = substr ($key, 1);
            if (defined $self->{'index_mapping'}->{$cmeta}) {
                print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                    $collectionmeta->{$key} . "\n";
            } else {
                print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
653
# Does nothing in this class apart from taking the object off the
# argument list; the object is also what the call evaluates to.
sub collect_specific {
    my $self = shift;
    return $self;
}
657
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the number of words and sections reported by
# mgstat (if it can be run), the index, subcollection and language maps
# of the object's index_mapping, and the indexes that weren't built.
sub make_auxiliary_files {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    # the build information is collected in a hash of its own for every
    # call, so nothing from an earlier call can end up in build.cfg
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "$mgstat_exe -d $self->{'build_dir'} -f $input_file |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
724
# Does nothing in this class apart from taking the object off the
# argument list; the object is also what the call evaluates to.
sub deinit {
    my $self = shift;
    return $self;
}
728
# Prints the statistics of the buildproc's last run to the output handle:
# what was being done, the number of bytes in the collection and the
# number of bytes that were processed. A warning is added if fewer than
# 50 bytes were processed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    # the wording depends on whether an index was being created or the
    # text was being compressed
    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    my $consequence = $indexing_text
        ? "This may cause an error while attempting to build the index\n"
        : "This may cause an error while attempting to compress the text\n";

    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }
}
757
7581;
759
760
Note: See TracBrowser for help on using the repository browser.