source: trunk/gsdl/perllib/mgbuilder.pm@ 1251

Last change on this file since 1251 was 1251, checked in by sjboddie, 24 years ago

Added some stat reporting and a warning message to the build code.
Now warns when very little or no text is to be processed for a given
index (as mg craps out in these situations). Will hopefully be useful
in realizing when an attempt is made to create an index of metadata that
is never set etc.

  • Property svn:keywords set to Author Date Id Revision
File size: 23.5 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Value passed to mg_passes through its -b option (maximum document size).
$maxdocsize = 12000;

# Suffixes of the index files that are kept by the clean-up pass at the end
# of build_index; a file whose suffix is not listed here is deleted.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
46
47
# Constructs an mgbuilder object for one collection.
#
# Positional arguments: class, collection name, source directory, build
# directory, verbosity, maxdocs, debug flag, keepold flag and the
# allclassifications flag.  Everything except the class is stored in the
# object under the key of the same name.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and by language, loads the plugins and classifiers, and
# creates the document processor used for building.
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if the
# document processor could not be required or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'notbuilt'           => [],    # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes first and language subindexes second:
    # each configured entry is appended (after a ':') to every index
    # description gathered so far.
    # NOTE(review): when 'indexsubcollections' is not configured the language
    # lands in the third field of the description, which is the field
    # create_index_mapping and build_index read as the subcollection -
    # confirm whether an empty subcollection field should be inserted.
    foreach my $cfgkey ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$cfgkey};

        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $suffix (@{$self->{'collect_cfg'}->{$cfgkey}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$suffix");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # The class name is only known at run time.  Calling the constructor as
    # a method on the name string avoids compiling the (collection-derived)
    # name as Perl source, which the former string eval did.
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity);
    };
    die "$@" if $@;

    return $self;
}
144
# Prepares the build directory.  Unless this is a debug run or the old
# build is to be kept, the build directory is removed and recreated
# together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # make the text directory
        &util::mk_all_dir("$builddir/text");
    }
}
158
# Creates the compressed text for the collection.
#
# $textindex is passed on to the document processor as the index whose
# text is to be compressed.  The documents are read twice: the first pass
# (mg_passes -T1) collects text statistics, then mg_compression_dict
# builds the compression dictionary, and the second pass (mg_passes -T2)
# compresses the text.  In debug mode nothing is executed and the
# document processor writes to STDOUT instead.
#
# Dies if mg_passes or mg_compression_dict is missing or mg_passes cannot
# be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print STDERR "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # the handle is passed around by name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once, and only if one was opened
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print STDERR "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print STDERR "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
239
# Decides whether an index should be built.
#
# Returns 0 when the index description matches (anchored at both ends) one
# of the patterns in the collection's 'dontbuild' configuration, in which
# case the mapped directory name of the index is also appended to the
# object's 'notbuilt' list.  Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
255
# Builds either the single index named by $indexname (when it contains a
# word character) or every index listed in the collection configuration.
# The mapping from index descriptions to directory names is (re)created
# first and stored in the object; indexes rejected by want_built are
# reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes;
    if (defined $indexname && $indexname =~ /\w/) {
        $indexes = [$indexname];
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
282
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "level:granularity[:subcollection[:languages]]".
#
# Returns a reference to a hash holding
#   - for every index description, its directory name;
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map the
#     "level:granularity", subcollection and language parts to their
#     processed (short) names;
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder',
#     which list the keys of those maps in the order they were first seen.
#
# Dies if no unique directory name can be produced for an index.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name -> original name, kept separately for each part so
    # that make_unique can tell which part of a colliding name to change.
    # Each part needs its own hash; sharing one would let an index name
    # overwrite a subcollection or language name.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            # the same description listed twice simply reuses its directory
            last if $dirnames{$dirname} eq $index;

            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);

            # make_unique changed nothing: give up rather than loop forever
            if ($candidate eq $dirname) {
                die "mgbuilder::create_index_mapping - couldn't create a unique " .
                    "directory name for index $index\n";
            }
            $dirname = $candidate;
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the same key that was just stored in 'languagemap'
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
346
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# Components are separated by commas.  A field that is undefined or has
# no word character gives "".  A one-component field with no consonant
# after its first character gives its first two characters, and a field
# of a single character gives that character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # first character of each of the first two components
        return join ("", map { /^(.)/ ? $1 : "" } @components[0, 1]);
    }

    # first character followed by the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or, failing that, the first two characters ...
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # ... or, for a field of only one character, that character alone
    unless (defined $first && defined $second) {
        ($first) = $field =~ /^(.)/;
        $first = "" unless defined $first;
        $second = "";
    }

    return "$first$second";
}
369
# Alters one part of a colliding directory name and returns the new name.
#
# $namehash maps, for each of 'index', 'subcollection' and 'languages',
# the processed names already in use to the original values they stand
# for.  $indexref, $subref and $langref are references to the processed
# parts of the name for the index description $index.  The first part (in
# that order) whose processed name is not already registered for this
# index's own value is advanced with get_next_version; at most one part is
# changed per call.  Returns the three parts concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @parts = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $original) = @$part;
        next unless $namehash->{$kind}->{$$nameref} ne $original;

        $self->get_next_version ($nameref);
        last;
    }

    return join ("", $$indexref, $$subref, $$langref);
}
384
# Advances the name referred to by $nameref to its next version, in place.
#
# A name that ends in digits has that trailing number incremented (so a
# name ending in 9 now ends in 10).  Any other name has its last character
# replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # string increment: "9" becomes "10" and a leading zero is kept
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d+$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
400
# Builds one index.
#
# $index is an index description ("level:fields[:subcollection...]") that
# must already be present in the object's index mapping.  The documents
# are read twice through mg_passes (-N1 to create the index dictionary,
# -N2 to invert the text); between and after those passes the perfect
# hash, weights, on-disk stemmed dictionary and stem index tools are run,
# and finally every file in the index directory whose suffix is not listed
# in %wanted_index_files is deleted.  In debug mode none of the tools are
# run and the document processor writes to STDOUT instead.
#
# Dies if one of the required executables is missing, if mg_passes cannot
# be started, or if the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for the configured languages
    # NOTE(review): an expression is added for every configured language,
    # whichever language this index description names - confirm that this
    # is what the document processor's set_index expects.
    foreach my $cfglanguage (@{$self->{'collect_cfg'}->{'languages'} || []}) {
        # work on a copy: the loop variable aliases the configuration entry,
        # and stripping the '!' there would lose the negation for every
        # index built after this one
        my $language = $cfglanguage;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # the handle is passed around by name
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
557
# Creates the collection's info database and processes associated files.
#
# The "text" and "assoc" directories are created under the build
# directory, the classifiers are initialised, and the output of the
# document processor (run in 'infodb' mode) is piped into txt2db, which is
# given the database file name in the text directory (extension .ldb on
# little-endian machines, .bdb otherwise).  A "[collection]" record holding
# the collection metadata is written first when any is configured, and the
# classification information is written last.  In debug mode everything
# goes to STDOUT and txt2db is not run.
#
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my ($self) = @_;

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; the handle is passed around by name
    my $handle;
    if (!$self->{'debug'}) {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    } else {
        $handle = 'STDOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the index mapping is needed to translate index-specific metadata
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $value = $self->{'collect_cfg'}->{'collectionmeta'}->{$key};

            # ordinary metadata is written under its own name
            if ($key !~ /^\./) {
                print $handle "<$key>$value\n";
                next;
            }

            # a leading '.' marks metadata for an index: it is written under
            # the directory name of that index
            my $cmeta = $key;
            $cmeta =~ s/^\.//;
            if (defined $self->{'index_mapping'}->{$cmeta}) {
                print $handle "<$self->{'index_mapping'}->{$cmeta}>" . $value . "\n";
            } else {
                print STDERR "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
634
# Does nothing beyond taking its invocant.
# NOTE(review): presumably a hook for collection-specific builders - confirm.
sub collect_specific {
    my $self = $_[0];
}
638
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the index map and - only when they
# are not empty - the subcollection map, the language map and the list of
# indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh hash for every call, so that optional entries written by an
    # earlier call cannot leak into this build.cfg
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
685
# Does nothing beyond taking its invocant.
# NOTE(review): presumably the counterpart of init - confirm.
sub deinit {
    my $self = $_[0];
}
689
# Reports on STDERR the byte counts the document processor gathered during
# the pass just completed, and adds a warning when fewer than 50 bytes were
# processed for the current index.
sub print_stats {
    my ($self) = @_;

    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my @report = ();
    push (@report, $indexing_text
          ? "Stats (Creating index $index)\n"
          : "Stats (Compressing text from $index)\n");
    push (@report, "Total bytes in collection: $num_bytes\n");
    push (@report, "Total bytes in $index: $num_processed_bytes\n");

    if ($num_processed_bytes < 50) {
        push (@report, "***************\n");
        push (@report, "WARNING: There is very little or no text to process for $index\n");
        push (@report, $indexing_text
              ? "This may cause an error while attempting to build the index\n"
              : "This may cause an error while attempting to compress the text\n");
        push (@report, "***************\n");
    }

    foreach my $line (@report) {
        print STDERR $line;
    }
}
717
7181;
719
720
Note: See TracBrowser for help on using the repository browser.