source: trunk/gsdl/perllib/mgbuilder.pm@ 817

Last change on this file since 817 was 810, checked in by sjboddie, 25 years ago

plugins now take options, files are associated at build time as
well as import time

  • Property svn:keywords set to Author Date Id Revision
File size: 22.3 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Maximum document size, passed to mg_passes through its -b option.
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# Files in an index directory whose suffix is not listed here are deleted
# at the end of build_index.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
46
47
# Creates an mgbuilder object for a collection.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag and the
# allclassifications flag. All of them are stored on the object.
#
# The constructor reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the
# configured indexes by subcollection and by language, loads the plugins
# and classifiers named in the configuration and creates the document
# processor used for building.
#
# Dies if collect.cfg is missing, if no plugins could be loaded, or if
# the document processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once for
    # each subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index (already expanded by
    # subcollection, if any) is repeated once for each language
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name. The
    # indirect-object form inside a string eval is ambiguous here because
    # this package defines its own sub called "new".
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
144
# Prepares the build directory for a fresh build.
# Nothing is touched when either the debug or the keepold flag is set.
sub init {
    my $self = shift (@_);

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds and start again with an empty directory
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$build_dir/text");
    }
}
158
# Creates the compressed text for the collection under <build_dir>/text.
#
# The documents are read twice through plugin::read: a first mg_passes
# run (-T1) collects the text statistics, mg_compression_dict then builds
# the compression dictionary, and a second mg_passes run (-T2) compresses
# the text. $textindex is handed to the document processor with set_index.
#
# In debug mode no external program is run and the document processor
# writes to STDOUT instead (both passes still read the documents).
# Dies if one of the required executables is missing or cannot be started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # locate the executables
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    # work out where the text files go
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows wants backslashes in the prefix, everything else is given
    # an explicit "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $stats_cmd = "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra";
    my $compress_cmd = "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra";
    my $dict_cmd = "$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra";

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" || !open (PIPEOUT, $stats_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ($dict_cmd);

        # reopen the pipe (under the same handle name) for the second pass
        if (!-e "$mg_passes_exe" || !open ($handle, $compress_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};
}
233
# Decides whether an index should be built.
# Returns 0 if $index matches (anchored at both ends) one of the
# "dontbuild" patterns of the collection configuration, recording the
# index's mapped directory name in the object's 'notbuilt' list;
# returns 1 otherwise.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that was skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
249
# Builds the indexes of the collection.
# If $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is.
# The index-to-directory mapping is regenerated for the chosen indexes
# and stored on the object before anything is built.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes, skipping those that are not wanted
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
276
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of descriptions of the form
# "level:fields[:subcollection[:languages]]". Returns a reference to a
# hash holding
#   <description>           => directory name of that index
#   'indexmap'              => { "level:fields" => processed name }
#   'subcollectionmap'      => { subcollection  => processed name }
#   'languagemap'           => { languages      => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                           => the keys of the maps above, in the order
#                              they were first seen
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => what it stands for, kept separately for each part
    # of the directory name. These must be real (and distinct) hashes:
    # empty strings would be used as symbolic references to one shared
    # global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # descriptions that have already been given a directory
    my %seen = ();

    foreach my $index (@$indexes) {
        # a description listed twice keeps the directory it got the first
        # time; trying to make it "unique" against itself never terminates
        next if $seen{$index};
        $seen{$index} = 1;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the key that was just added to languagemap
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
340
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# An undefined field, or one without any word character, gives "".
# A field that is too short to supply a second character gives just
# its first character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components take part in the name
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    # first character followed by the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or simply the first two characters if no consonant follows
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # fields of a single character cannot fill both slots; use what is there
    $first = substr ($field, 0, 1) unless defined $first;
    $second = "" unless defined $second;

    return "$first$second";
}
363
# Changes one part of a directory name so that it no longer collides.
#
# $namehash maps, per part ('index', 'subcollection', 'languages'), the
# processed names already handed out to what they stand for. The first
# part of $index whose processed name (passed by reference) is recorded
# for something else is moved on to its next version; at most one part
# is changed per call. Returns the concatenation of the three names.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # part of the name, reference to its processed name, what it should stand for
    my @parts = (['index',         $indexref, "$level:$gran"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $owner) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $owner) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
378
# Moves the name referenced by $nameref on to its next version, in place.
# A name ending in digits has that number incremented (one digit, or the
# last two digits); any other name has its last character replaced by "0".
# A name ending in a single "9" becomes "...10": it has to change, because
# callers keep asking for the next version until the name is free.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # covers 9 -> 10 as well; only one digit is there to be replaced
        my $num = $1; $num ++;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
394
# Builds one index in its own subdirectory of the build directory.
#
# $index is an index description ("level:fields[:subcollection...]");
# its directory name is taken from the object's index mapping. The
# documents are read twice through plugin::read, feeding mg_passes
# (-N1, then -N2); between and after the passes mg_perf_hash_build,
# mg_weights_build, mg_invf_dict and mg_stem_idx are run. Finally every
# file in the index directory whose suffix is not in %wanted_index_files
# is removed.
#
# In debug mode no external program is run and the document processor
# writes to STDOUT. Dies if a required executable is missing or cannot
# be started, or if the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    my $languages = $self->{'collect_cfg'}->{'languages'};
    $languages = [] unless defined $languages;
    foreach my $language (@$languages) {
        # strip the "!" from a copy: the loop variable aliases the
        # configuration entry, which has to keep its negation for the
        # indexes built after this one
        my $langname = $language;
        if ($langname =~ s/^\!//) {
            push (@$indexexparr, "!Language/$langname/");
        } else {
            push (@$indexexparr, "Language/$langname/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        # reopen the pipe (under the same handle name) for the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
547
# Creates the collection's info database and processes associated files.
#
# The database is <build_dir>/text/<collection>.ldb on little-endian
# machines and .bdb otherwise; it is written by piping text to txt2db.
# Output consists of an optional "[collection]" record built from the
# collectionmeta settings, whatever the document processor emits in
# 'infodb' mode for each document, and the classification information.
# Associated files go to <build_dir>/assoc.
#
# In debug mode txt2db is not run and everything is printed to STDOUT.
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print STDERR "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # decide where the output goes
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    # write the collection level metadata
    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the index mapping may not exist yet if no index has been built
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $key (keys (%$collectionmeta)) {
            my $value = $collectionmeta->{$key};

            # ordinary metadata is written out under its own name
            if ($key !~ /^\.(.*)/s) {
                print $handle "<$key>$value\n";
                next;
            }

            # a key starting with "." names an index description; it is
            # written out under the directory name of that index
            my $indexname = $1;
            if (defined $self->{'index_mapping'}->{$indexname}) {
                print $handle "<$self->{'index_mapping'}->{$indexname}>" .
                    $value . "\n";
            } else {
                print STDERR "mgbuilder: warning bad collectionmeta option '$indexname' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
624
# Hook for collection specific processing; does nothing here.
sub collect_specific {
    my $self = shift;
}
628
# Writes <build_dir>/build.cfg, the record of what was built.
#
# The file holds the build date, the number of documents and bytes
# reported by the document processor, the index, subcollection and
# language maps (as "name->directory" entries, the latter two only when
# they are not empty) and the list of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # build the configuration in a fresh hash on every call so that
    # nothing written by an earlier call can leak into this one
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my $index_mapping = $self->{'index_mapping'};

    my @indexmap = ();
    foreach my $index (@{$index_mapping->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" . $index_mapping->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$index_mapping->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $index_mapping->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$index_mapping->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $index_mapping->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
675
# Counterpart of init; nothing needs to be cleaned up here.
sub deinit {
    my $self = shift;
}
679
680
6811;
682
683
Note: See TracBrowser for help on using the repository browser.