source: trunk/gsdl/perllib/mgbuilder.pm@ 1072

Last change on this file since 1072 was 1072, checked in by sjboddie, 24 years ago

Fixed bug - Control B's and C's were only being removed from body of text
and not from metadata values. This caused problems for mg when indexing
metadata values containing Control B's or C's. They're now removed from
both text and metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
Line 
###########################################################################
#
# mgbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Value passed to mg_passes through its -b option (maximum document size;
# the callers' comments describe it as 12 meg).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index removes every file in the index directory whose suffix is
# not a key of this hash.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
46
47
# Constructor.
#
# Arguments (positional, after the class name):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications -- each is stored in the object under
#   the hash key of the same name.
#
# Besides storing its arguments the constructor
#   - reads $GSDLCOLLECTDIR/etc/collect.cfg (dies if the file is missing),
#   - expands the configured indexes with the 'indexsubcollections' and
#     'languages' entries, giving "index:subcollection" style descriptions,
#   - loads the plugins (dies if none load) and the classifiers,
#   - records the 'dontgdbm' fields as a lookup hash,
#   - loads and instantiates the document processor: a collection specific
#     <collection>buildproc.pm if one exists, otherwise mgbuildproc.
#
# Returns the new blessed object.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]    # indexes not built
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes first, then language subindexes:
    # every existing index description is repeated once per entry, with
    # ":entry" appended.  The order (subcollections before languages)
    # matters because the second pass works on the output of the first.
    foreach my $cfgkey ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$cfgkey};

        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $suffix (@{$self->{'collect_cfg'}->{$cfgkey}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$suffix");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name.  This
    # avoids compiling code from a string and the ambiguous indirect
    # object syntax ("new Class(...)"), which is parsed heuristically and
    # is especially fragile here because this package defines its own new().
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity);
    };
    die "$@" if $@;

    return $self;
}
144
# Prepares the build directory.  Unless the object was created with the
# 'debug' or 'keepold' flag set, any previous build is removed and an
# empty build directory containing a "text" subdirectory is created.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
158
# Creates the compressed text of the collection under <build_dir>/text.
#
# Argument: $textindex -- handed to the document processor's set_index().
#
# The documents are pushed through the plugins twice:
#   pass 1 pipes them into "mg_passes -T1" to collect text statistics,
#          after which mg_compression_dict builds the dictionary;
#   pass 2 pipes them into "mg_passes -T2" to do the compression.
# With the 'debug' flag set nothing is executed; the document processor
# output of both passes goes to STDOUT instead.
#
# Dies if a required executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print STDERR "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # The handle is passed around by NAME (a plain string), so that the
    # document processor, which lives in another package, can print to it
    # and so that the same name can be re-opened for the second pass.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # Close the first pipe exactly once.  close() on a pipe waits for the
    # child and returns false when it exited with a non-zero status, so
    # report that instead of silently carrying on.
    if (!$self->{'debug'}) {
        close ($handle) ||
            print STDERR "mgbuilder::compress_text - warning: " .
                "$mg_passes_exe -T1 did not exit cleanly (status $?)\n";
    }

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print STDERR "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        # ($handle still holds the handle name, so this re-opens the very
        # handle the document processor was given above)
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print STDERR "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {
        close ($handle) ||
            print STDERR "mgbuilder::compress_text - warning: " .
                "$mg_passes_exe -T2 did not exit cleanly (status $?)\n";
    }
}
235
# Decides whether an index should be built.
#
# Each entry of the collection's 'dontbuild' configuration is used as a
# pattern that must match the whole index description.  On the first match
# the directory name mapped to the index is appended to the object's
# 'notbuilt' list and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
251
# Builds the collection's indexes.
#
# Argument: $indexname (optional) -- when it contains at least one word
# character only that index is considered; otherwise every index listed
# in the collection configuration is.  The index-to-directory mapping is
# (re)created for the chosen indexes and stored in the object, then each
# index that want_built() accepts is built with build_index().
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    my $verbose = ($self->{'verbosity'} >= 1);
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if $verbose;
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if $verbose;
        $self->build_index($index);
    }
}
278
# creates directory names for each of the index descriptions
#
# Argument: $indexes -- reference to a list of index descriptions of the
# form "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash containing
#   <index description>            => directory name
#   'indexmap'/'indexmaporder'     => "level:fields" => short name, plus
#                                     the order in which they were met
#   'subcollectionmap'/'...order'  => the same for subcollections
#   'languagemap'/'...order'       => the same for languages
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => description, kept separately for each component.
    # Each slot must be a hash of its own: starting them off as '' made
    # every access a symbolic reference to one and the same package hash,
    # so the three name spaces were mixed up (and survived between calls).
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique.  A name that is
        # already taken by this very description (the description is
        # listed twice) is not a collision; make_unique could never
        # resolve it and the loop would not terminate.
        while (defined ($dirnames{$dirname}) && $dirnames{$dirname} ne $index) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the key that was just stored in 'languagemap'
            # (this used to push an unrelated, undefined variable)
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
342
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component (or simply the first two characters when no
# consonant follows) - otherwise it will contain the first
# character of the first two components.
# an undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @parts = split /,/, $field;

    if (scalar @parts < 2) {
        # single component: first character plus the next consonant ...
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        # ... falling back to the first two characters
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }

    # several components: initial of each of the first two
    my @initials = @parts[0, 1];
    foreach my $part (@initials) {
        $part =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
365
# Tries to resolve a directory name collision for an index description.
#
# Arguments: $namehash -- hash ref with the keys 'index', 'subcollection'
#            and 'languages', each mapping a short name to the description
#            that owns it; $index -- the index description; $indexref,
#            $subref, $langref -- references to the three short name parts.
#
# The first part whose short name is recorded for a different description
# is advanced with get_next_version() (at most one part per call).
# Returns the three parts joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [ name space, reference to the short name, description it stands for ]
    my @parts = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
380
# Advances the short name referenced by $nameref to its next version,
# modifying it in place:
#   - a name ending in two digits has that number incremented,
#   - a name ending in one digit has the digit incremented (9 becomes 10),
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only ONE trailing digit is present in this branch, so exactly one
        # digit has to be replaced.  The old special case for 9 tried to
        # replace two digits, never matched and left the name unchanged,
        # which kept the caller's uniqueness loop spinning for ever.
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
396
# Builds one index.
#
# Argument: $index -- the index description ("level:fields[:subcollection]").
# The index is written to the directory that $self->{'index_mapping'} gives
# for the description, below the build directory.
#
# Steps: pipe the documents into "mg_passes -N1" (index dictionary), run
# mg_perf_hash_build, pipe the documents into "mg_passes -N2" (inversion),
# then run mg_weights_build, mg_invf_dict and mg_stem_idx (three stemming
# methods) and finally delete every file in the index directory whose
# suffix is not in %wanted_index_files.  With the 'debug' flag set none of
# the programs are run and the document processor writes to STDOUT.
#
# Dies if a required executable is missing, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    #
    # Work on a COPY of the configured list: a foreach variable aliases
    # the element it visits, so stripping the leading "!" directly would
    # rewrite the collection configuration and every index built after the
    # first one would lose the negation.  The "|| []" also keeps an absent
    # 'languages' entry from being created as a side effect.
    my @languages = @{$self->{'collect_cfg'}->{'languages'} || []};
    foreach my $language (@languages) {
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # the handle is passed around by name, so that the document processor
    # can print to it and the same name can be re-opened for the second pass
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
549
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped into txt2db, which writes <build_dir>/text/<collection>.ldb (or
# .bdb when util::is_little_endian() is false).  Before the documents are
# read a "[collection]" record holding the collection metadata is written;
# afterwards the classifiers' information is appended.  With the 'debug'
# flag set everything is written to STDOUT and txt2db is not run.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; the output handle is passed by name
    my $handle = 'STDOUT';
    if (!$self->{'debug'}) {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            # ordinary metadata is written under its own name
            unless ($cmeta =~ s/^\.//) {
                print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";
                next;
            }

            # a name that started with "." refers to an index (or map)
            # and is written under the name the index mapping gives it
            my $mapped = $self->{'index_mapping'}->{$cmeta};
            if (defined $mapped) {
                print $handle "<$mapped>" .
                    $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
            } else {
                print STDERR "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
626
# Hook for collection specific processing; does nothing in this class.
sub collect_specific {
    my $self = shift;
}
630
# Writes <build_dir>/build.cfg, recording
#   builddate, numdocs, numbytes  -- build time and the document
#                                    processor's document and byte counts,
#   indexmap                      -- "description->short name" entries,
#   subcollectionmap, languagemap -- the same for subcollections and
#                                    languages (only when not empty),
#   notbuilt                      -- the indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh configuration for every call.  This used to be written to an
    # undeclared package variable (the declared %build_cfg was never used),
    # so entries that are only set conditionally, such as
    # 'subcollectionmap', could leak in from an earlier call.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
677
# Counterpart of init(); there is nothing to clean up in this class.
sub deinit {
    my $self = shift;
}
681
682
6831;
684
685
Note: See TracBrowser for help on using the repository browser.