source: branches/New_Config_Format-branch/gsdl/perllib/mgbuilder.pm@ 1279

Last change on this file since 1279 was 1279, checked in by sjboddie, 24 years ago

merged changes to trunk into New_Config_Format branch

  • Property svn:keywords set to Author Date Id Revision
File size: 24.2 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Maximum document size, handed to mg_passes through its -b option.
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index removes files carrying any other suffix.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
46
47
# Creates an mgbuilder object for $collection.
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, expands the configured
# indexes by subcollection and by language, loads the plugins and
# classifiers named in the configuration and instantiates the document
# processor: <collection>buildproc if the collection supplies one in its
# perllib directory, mgbuildproc from $ENV{'GSDLHOME'}/perllib otherwise.
#
# Dies if collect.cfg cannot be found or if no plugins were loaded. Errors
# raised while loading or constructing the document processor propagate to
# the caller.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                # An index description is split elsewhere in this file into
                # level:fields:subcollection:language. When the index has no
                # subcollection field, add an empty one so that the language
                # ends up in the fourth field and is not read as a
                # subcollection.
                my @fields = split (":", $index, -1);
                my $separator = (scalar (@fields) < 3) ? "::" : ":";
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index$separator$language");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Calling new on the class name avoids compiling code from a string;
    # an exception thrown by the constructor still reaches the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
144
# Prepares the build directory for a fresh build.
# Nothing is touched when the builder runs in debug mode or was asked to
# keep the old build.
sub init {
    my $self = shift;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # the text directory lives directly below the build directory
        &util::mk_all_dir("$build_dir/text");
    }
}
158
# Creates the compressed text of the collection in <build_dir>/text.
#
# $textindex is passed to the document processor as the index whose text is
# compressed. Three steps are run:
#   1. mg_passes -T1 collects the text statistics,
#   2. mg_compression_dict builds the compression dictionary,
#   3. mg_passes -T2 compresses the text.
# In debug mode no external program is started; the document processor
# writes to STDOUT instead and step 2 is skipped.
#
# Dies if one of the external programs is missing or cannot be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # The output handle is passed around by name so that the document
    # processor, which lives in another package, can print to it.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the first pass exactly once; STDOUT is left open in debug mode
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        # $handle still holds the name of PIPEOUT, so this reopens that handle
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
239
# Decides whether $index should be built.
# Returns 0 when the index matches one of the patterns listed under
# 'dontbuild' in the collection configuration; the directory name mapped to
# that index is then recorded in the 'notbuilt' list. Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        # each entry is used as a regular expression that must match the
        # whole index description
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
255
# Builds the indexes of the collection.
# When $indexname contains a word character only that index is built;
# otherwise every index listed in the collection configuration is built.
# Indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
282
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# level:fields[:subcollection[:languages]].
#
# Returns a reference to a hash that maps every index description to its
# directory name and additionally holds
#   'indexmap', 'subcollectionmap', 'languagemap'
#       - the abbreviation chosen for each level:fields pair, subcollection
#         and language
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#       - the keys of those maps in the order they were first seen
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers which source each abbreviation was derived from.
    # Every table needs its own hash; the former '' placeholders were
    # dereferenced as hashes and so did not give three separate tables.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique. An index description
        # that is listed twice keeps its directory name; make_unique has
        # nothing to change for it, so looping on it would never end.
        while (defined ($dirnames{$dirname}) && $dirnames{$dirname} ne $index) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the key that was just stored in languagemap
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
346
# Returns a processed (abbreviated) version of a field.
# If the field has only one component the processed version contains the
# first character and the next consonant of that component, or its first
# two characters if no consonant follows. A field that is a single
# character is returned as it is. If the field has two or more comma
# separated components the processed version contains the first character
# of each of the first two components.
# Returns "" for an undefined field or one without any word character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $next) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $next) = $field =~ /^(.)(.)/ unless defined $first && defined $next;
    return "$first$next" if defined $first && defined $next;

    # neither pattern matched: the field is too short to give two
    # characters, so use whatever first character it has
    my ($only) = $field =~ /^(.)/;
    return defined $only ? $only : "";
}
369
# Resolves a directory name collision for $index.
# $namehash holds, per component ('index', 'subcollection', 'languages'),
# the source each abbreviation was derived from. The first component whose
# current abbreviation belongs to a different source is moved on to its
# next version through the reference passed in; at most one component is
# changed per call. Returns the directory name built from the three
# abbreviations.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @components = (['index',         $indexref, "$level:$gran"],
                      ['subcollection', $subref,   $subcollection],
                      ['languages',     $langref,  $languages]);

    foreach my $component (@components) {
        my ($table, $nameref, $source) = @$component;
        if ($namehash->{$table}->{$$nameref} ne $source) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
384
# Moves the name referenced by $nameref on to its next version, in place.
#   - a name ending in two digits has that number incremented,
#   - a name ending in a single digit has that digit incremented, so 9
#     becomes 10 and the name grows by one character,
#   - any other name has its last character replaced by 0 (an empty name
#     becomes "0").
# The name always changes, so a caller that loops until the name is unique
# makes progress on every call.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # The old special case for 9 tried to replace two trailing digits,
        # which cannot match here, and left the name unchanged.
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref !~ s/.$/0/) {
        # nothing to replace
        $$nameref .= "0";
    }
}
400
# Builds one index.
#
# $index is an index description of the form
# level:fields[:subcollection[:languages]]; its directory name is looked up
# in the index mapping. The document processor is run twice over the source
# documents (mg_passes -N1 for the index dictionary, mg_passes -N2 for the
# inversion); mg_perf_hash_build, mg_weights_build, mg_invf_dict and
# mg_stem_idx produce the remaining files. Finally every file in the index
# directory whose suffix is not listed in %wanted_index_files is deleted.
#
# In debug mode no external program is run and the document processor
# writes to STDOUT.
#
# Dies if one of the external programs is missing or cannot be started, or
# if the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection, $languages) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection. The languages named in the index description
    # are used; a description without a language field falls back to every
    # configured language, as before. Both lists are copies, so stripping
    # the leading "!" below no longer alters the collection configuration
    # for the indexes built afterwards.
    my @languages = ();
    if (defined $languages) {
        @languages = split /,/, $languages;
    } elsif (defined $self->{'collect_cfg'}->{'languages'}) {
        @languages = @{$self->{'collect_cfg'}->{'languages'}};
    }
    foreach my $language (@languages) {
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # The output handle is passed around by name so that the document
    # processor, which lives in another package, can print to it.
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        # $handle still holds the name of PIPEOUT, so this reopens that handle
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
557
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped into txt2db, which writes <build_dir>/text/<collection>.ldb on
# little endian machines and <collection>.bdb otherwise. The collection
# level metadata is written first, then the documents, then the classifier
# information. In debug mode everything goes to STDOUT instead.
#
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift;

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        # the index mapping is needed to translate index names below
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            my $name = $key;
            if ($name =~ s/^\.//) {
                # a leading "." marks metadata that belongs to an index; it
                # is written under the directory name of that index
                if (defined $self->{'index_mapping'}->{$name}) {
                    print $handle "<$self->{'index_mapping'}->{$name}>" .
                        $collectionmeta->{".$name"} . "\n";
                } else {
                    print STDERR "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
                }
            } else {
                print $handle "<$name>$collectionmeta->{$name}\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) unless $self->{'debug'};
}
634
# Placeholder for collection specific processing; this implementation does
# nothing beyond taking the object off the argument list.
sub collect_specific {
    my $self = shift;
}
638
# Writes <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the number of words and sections
# reported by mgstat (when it can be run), the index, subcollection and
# language maps taken from the index mapping, and the list of indexes that
# were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # All values are gathered in a hash local to this call. They used to be
    # stored through the package variable $build_cfg, so entries that are
    # only set conditionally (the maps, 'notbuilt') could survive from an
    # earlier call in the same process.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "$mgstat_exe -d $self->{'build_dir'} -f $input_file |")) {
        print STDERR "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
704
# Counterpart of init; this implementation does nothing beyond taking the
# object off the argument list.
sub deinit {
    my $self = shift;
}
708
# Prints the statistics of the document processor's last run to STDERR:
# the index (or text) that was processed, the number of bytes in the
# collection and the number of bytes that were processed. A warning is
# added when fewer than 50 bytes were processed.
sub print_stats {
    my ($self) = @_;
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print STDERR $heading;
    print STDERR "Total bytes in collection: $num_bytes\n";
    print STDERR "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print STDERR "***************\n";
        print STDERR "WARNING: There is very little or no text to process for $index\n";
        print STDERR $consequence;
        print STDERR "***************\n";
    }
}
736
7371;
738
739
Note: See TracBrowser for help on using the repository browser.