source: trunk/gsdl/perllib/mgbuilder.pm@ 1317

Last change on this file since 1317 was 1304, checked in by sjboddie, 24 years ago

fixed an intermittent bug (I hope) when building under windows

  • Property svn:keywords set to Author Date Id Revision
File size: 24.4 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch autoflush on for STDOUT and STDERR while the module is in use
# so that mg doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# value handed to mg_passes through its -b (maximum document size) option
$maxdocsize = 12000;

# suffixes of the files that are kept in an index directory once the
# index has been built; build_index deletes files with any other suffix
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructs an mgbuilder object for one collection.
#
# Arguments (positional): $class, $collection, $source_dir, $build_dir,
# $verbosity, $maxdocs, $debug, $keepold, $allclassifications. Apart
# from $class each one is stored on the object under the key of the
# same name.
#
# The constructor reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the
# configured index list by subcollection and by language, loads the
# plugins and classifiers, records the dontgdbm fields and creates the
# document processor ('buildproc').
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if
# the document processor could not be constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]           # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is combined with
    # every subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: the (possibly already expanded)
    # index list is combined with every language in the same way
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # The class name is only known at run time; a method call on the
    # name held in $buildproctype does the job without compiling a
    # string of code. Failures are still re-thrown as a string.
    eval {
        $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                                   $build_dir, $verbosity);
    };
    die "$@" if $@;

    return $self;
}
156
# Prepares the build directory. When the builder is neither in debug
# nor in keepold mode the existing build directory is removed and
# recreated together with its "text" subdirectory; otherwise nothing
# is touched.
sub init {
    my ($self) = @_;

    my $leave_alone = $self->{'debug'} || $self->{'keepold'};
    unless ($leave_alone) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($builddir);
        &util::mk_all_dir ($builddir);

        # make the text directory
        &util::mk_all_dir ("$builddir/text");
    }
}
170
# Compresses the text of the collection with the mg tools.
#
# $textindex is handed to the document processor's set_index. The
# documents are read twice: the first pass is piped into
# "mg_passes -T1" to collect text statistics, mg_compression_dict then
# creates the compression dictionary, and the second pass is piped into
# "mg_passes -T2" to compress the text. In debug mode no executable is
# run and the document processor writes to STDOUT.
#
# Dies if an mg executable does not exist or mg_passes cannot be started.
sub compress_text {
    my ($self, $textindex) = @_;

    my $buildproc = $self->{'buildproc'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # work out where the executables for this platform live
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    # the compressed text files share the prefix <build_dir>/text/<collection>
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'},
                                              "text/$self->{'collection'}");

    # windows wants backslashes in the prefix, every other platform
    # gets the extra " -d /" argument instead
    my $on_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);
    my $osextra = $on_windows ? "" : " -d /";
    $fulltextprefix =~ s{/}{\\}g if $on_windows;

    # both mg_passes invocations differ only in their -T switch
    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $passes_cmd = "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize";

    print STDERR "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if $verbose;

    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        unless (-e "$mg_passes_exe" &&
                open (PIPEOUT, "$passes_cmd -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print STDERR "\n creating the compression dictionary\n" if $verbose;
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"
            unless -e "$mg_compression_dict_exe";
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # reopen the same handle for the second (compressing) pass
        unless (-e "$mg_passes_exe" &&
                open ($handle, "$passes_cmd -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();

    # compress the text
    print STDERR "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
250
# Decides whether $index should be built.
#
# Returns 0 when the index description matches, as an anchored regular
# expression, one of the 'dontbuild' entries of the collection
# configuration; the directory name mapped to the index is then
# appended to the object's 'notbuilt' list. Returns 1 in every other
# case.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that is being skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
266
# Builds the indexes of the collection.
#
# When $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is.
# The mapping from index descriptions to directory names is
# (re)created first and stored in the object's 'index_mapping'.
# Indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
293
# Creates a directory name for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that holds
#   - one entry per index description giving its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map the
#     "level:fields", subcollection and language parts of the
#     descriptions to the short names used in the directory names,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder',
#     which list the keys of those maps in first-seen order.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short names handed out so far, kept separately for each part of
    # a description. Each table has to be a hash of its own: make_unique
    # compares a part of the new description against the table for
    # that part only.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        # NOTE(review): this loop only ends if make_unique changes one
        # of the three parts; a description listed twice would not do
        # that - confirm the index list never contains duplicates.
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # the order list must hold the same key as the map above
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
357
# Returns a processed (shortened) version of a field.
# A field that is undefined or has no word character gives "".
# A field with two or more comma separated components gives the first
# character of each of the first two components. A field with a single
# component gives its first character followed by the next consonant,
# or simply its first two characters if no consonant follows.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components >= 2) {
        my @initials = ();
        foreach my $component (@components[0, 1]) {
            # cut the component down to its first character
            (my $initial = $component) =~ s/^(.).*$/$1/;
            push (@initials, $initial);
        }
        return join("", @initials);
    }

    # first character plus the next consonant ...
    my @pair = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    # ... or the first two characters if there was no consonant
    @pair = $field =~ /^(.)(.)/ unless scalar @pair;

    return join("", @pair);
}
380
# Alters one part of a colliding directory name and returns the new name.
#
# $namehash holds, under 'index', 'subcollection' and 'languages', the
# short names already handed out and the description part each one
# stands for. $index is the index description being named; $indexref,
# $subref and $langref are references to the three short names that
# make up its directory name.
#
# The first short name (in the order index, subcollection, languages)
# that is not recorded for this description's own part is advanced
# with get_next_version. The return value is the concatenation of the
# three, possibly updated, short names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [name table, short name, description part it has to stand for]
    my @parts = (['index',         $indexref, "$level:$gran"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($table, $nameref, $wanted) = @$part;
        next if $namehash->{$table}->{$$nameref} eq $wanted;

        # only the first differing part is changed per call
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
395
# Advances the short name referenced by $nameref to its next version,
# modifying it in place:
#   - a name ending in two digits has that two digit number incremented,
#   - a name ending in one digit has that digit incremented, 9 being
#     followed by 10,
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit is present here, so it is that single
        # digit that has to be replaced - also when 9 rolls over to 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
411
# Builds one index with the mg tools.
#
# $index is an index description ("level:fields[:subcollection...]")
# whose directory name must already be present in the object's
# 'index_mapping'.
#
# The documents are read twice: the first pass is piped into
# "mg_passes -N1" to create the index dictionary, mg_perf_hash_build
# then creates the perfect hash function, and the second pass is piped
# into "mg_passes -N2" to invert the text. Afterwards the weights file,
# the 'on-disk' stemmed dictionary and the stem indexes are created and
# every file whose suffix is not listed in %wanted_index_files is
# deleted from the index directory. In debug mode no executable is run
# and the document processor writes to STDOUT.
#
# Dies if an mg executable does not exist, mg_passes cannot be started
# or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    # NOTE(review): an expression is added for every configured language,
    # whatever language part $index itself carries - confirm this is intended.
    foreach my $configured (@{$self->{'collect_cfg'}->{'languages'}}) {
        # work on a copy: the loop variable aliases the entry of the
        # configuration list, so stripping the "!" from it directly
        # would turn a negated language into a plain one for every
        # index built after this one
        my $language = $configured;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        # reopen the same handle for the second (inverting) pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
568
# Creates the info database of the collection.
#
# The database text is piped into txt2db, which is given the database
# file <build_dir>/text/<collection>.ldb (little endian machines) or
# .bdb as its argument; in debug mode the text goes to STDOUT instead.
# The output consists of an optional "[collection]" record built from
# the collectionmeta entries of the collection configuration, the
# records written by the document processor in 'infodb' mode, and the
# classification information written by the classifiers. The "text"
# and "assoc" directories are created below the build directory first.
#
# Dies if txt2db does not exist or cannot be started.
sub make_infodatabase {
    my ($self) = @_;

    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s{/}{\\}g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print STDERR "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database text is written to
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the directory names are needed for the "." entries below
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $key (keys (%$collectionmeta)) {
            my $value = $collectionmeta->{$key};

            # an ordinary entry is written under its own name
            if ($key !~ /^\./) {
                print $handle "<$key>$value\n";
                next;
            }

            # an entry starting with "." names an index description and
            # is written under the directory name of that index
            my $cmeta = substr ($key, 1);
            if (defined $self->{'index_mapping'}->{$cmeta}) {
                print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                    $value . "\n";
            } else {
                print STDERR "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
645
# Takes the builder object and does nothing else in this class.
# NOTE(review): presumably a hook for collection specific builders to
# override - confirm against the callers.
sub collect_specific {
    my $self = shift;
}
649
# Writes <build_dir>/build.cfg, the summary of the finished build.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the number of words and sections
# reported by mgstat (when it can be run), the index, subcollection
# and language maps of the object's 'index_mapping', and the list of
# indexes that were not built. A missing or unusable mgstat only
# produces a warning.
sub make_auxiliary_files {
    my $self = shift (@_);

    # The build information is collected in a structure private to this
    # call, so nothing set by an earlier call can end up in build.cfg.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "$mgstat_exe -d $self->{'build_dir'} -f $input_file |")) {
        print STDERR "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
715
# Takes the builder object and does nothing else in this class.
# NOTE(review): presumably the counterpart of init, left empty because
# there is nothing to tear down - confirm against the callers.
sub deinit {
    my $self = shift;
}
719
# Prints the statistics of the pass that has just finished to STDERR.
#
# The figures are taken from the document processor: whether it was
# indexing or compressing text, the current index, the number of bytes
# in the collection and the number of bytes processed for the index.
# A warning is added when fewer than 50 bytes were processed.
sub print_stats {
    my ($self) = @_;
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    # the heading and the consequence depend on the kind of pass
    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    my $consequence = $indexing_text
        ? "This may cause an error while attempting to build the index\n"
        : "This may cause an error while attempting to compress the text\n";

    print STDERR $heading;
    print STDERR "Total bytes in collection: $num_bytes\n";
    print STDERR "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        print STDERR "***************\n";
        print STDERR "WARNING: There is very little or no text to process for $index\n";
        print STDERR $consequence;
        print STDERR "***************\n";
    }
}
747
7481;
749
750
Note: See TracBrowser for help on using the repository browser.