source: trunk/gsdl/perllib/mgbuilder.pm@ 1209

Last change on this file since 1209 was 1072, checked in by sjboddie, 24 years ago

Fixed bug - Control B's and C's were only being removed from body of text
and not from metadata values. This caused problems for mg when indexing
metadata values containing Control B's or C's. They're now removed from
both text and metadata.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.4 KB
RevLine 
[537]1###########################################################################
[4]2#
[537]3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
26package mgbuilder;
27
[215]28use classify;
[4]29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Maximum document size handed to mg_passes through its -b option (the
# build code in this file describes the value as "12 meg").
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index deletes every other suffixed file it finds in the index
# directory.
%wanted_index_files = map { ($_ => 1) }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
46
47
# Constructs an mgbuilder object for one collection.
#
# Arguments (positional):
#   $class, $collection, $source_dir, $build_dir, $verbosity,
#   $maxdocs, $debug, $keepold, $allclassifications
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes
# by subcollection and by language, loads the plugins and classifiers
# named in the configuration, and creates the build processor
# (a collection specific "<collection>buildproc" if one exists under
# $GSDLCOLLECTDIR/perllib, otherwise mgbuildproc from $GSDLHOME/perllib).
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if
# the build processor cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]     # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes, then language subindexes: every
    # index known so far is replaced by one "index:suffix" entry per
    # configured suffix (outer loop over suffixes, inner loop over indexes)
    foreach my $partition ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$partition};

        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $suffix (@{$self->{'collect_cfg'}->{$partition}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$suffix");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time; a method call on the
    # name string does the job without compiling code from a string.
    # Any exception raised by the constructor propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
144
# Prepares the build directory for a fresh build.
#
# Nothing is touched when the builder was created with debug or keepold
# set; otherwise the old build directory is removed and recreated
# together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    # debug and keepold builds leave whatever is already on disk alone
    return if ($self->{'debug'} || $self->{'keepold'});

    # remove any old builds
    my $builddir = $self->{'build_dir'};
    &util::rm_r($builddir);
    &util::mk_all_dir($builddir);

    # make the text directory
    &util::mk_all_dir("$self->{'build_dir'}/text");
}
158
# Creates the compressed text for the collection.
#
# Argument: $textindex - the index description handed to the build
# processor's set_index for the text passes.
#
# The documents are run through the plugins twice: once piped into
# "mg_passes ... -T1" to collect text statistics, and - after
# mg_compression_dict has built the compression dictionary - once piped
# into "mg_passes ... -T2" to compress the text. In debug mode no
# external program is run and both passes are written to STDOUT instead.
#
# Dies if mg_passes or mg_compression_dict cannot be found or started.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print STDERR "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # the handle is passed around by name, so the build processor keeps
    # writing to the right place when PIPEOUT is reopened for pass two
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # close the pipe exactly once, and only if one was opened
    close ($handle) unless $self->{'debug'};

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print STDERR "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print STDERR "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};
}
235
# Decides whether an index should be built.
#
# Argument: $index - an index description.
#
# Each entry of the collection's "dontbuild" configuration is used as an
# anchored pattern. On the first pattern matching $index the directory
# name mapped to the index is recorded in the object's 'notbuilt' list
# and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
251
# Builds either the single index named by the caller or every index
# listed in the collection configuration.
#
# Argument: $indexname - optional index description; when it is missing
# or contains no word character all configured indexes are used.
#
# The index-description-to-directory mapping is (re)created for the
# chosen indexes and stored in the object before any index is built.
# Indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
278
# Creates directory names for each of the index descriptions.
#
# Argument: $indexes - reference to a list of index descriptions of the
# form "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash containing
#   <index description>      => directory name
#   'indexmap'               => { "level:fields" => processed name }
#   'subcollectionmap'       => { subcollection  => processed name }
#   'languagemap'            => { languages      => processed name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                            => keys of the maps above in first-seen order
#
# Dies if no unique directory name can be generated for an index.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # one private hash per name component; each records which source
    # value a processed name currently stands for
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # index descriptions that have already been given a directory
    my %seen = ();

    foreach my $index (@$indexes) {
        # a repeated description keeps the directory it already has
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);

            # an unchanged name would be returned again on every further
            # attempt, so give up instead of looping forever
            if ($candidate eq $dirname) {
                die "mgbuilder::create_index_mapping - couldn't create a unique " .
                    "directory name for index $index\n";
            }
            $dirname = $candidate;
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the same key that was just stored in 'languagemap'
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
342
# Returns a processed (abbreviated) version of a field.
#
# An undefined field, or one without any word character, gives "".
# A field with two or more comma separated components gives the first
# character of each of its first two components. Otherwise the result is
# the first character of the field followed by the next consonant found
# after it, or - if there is no such consonant - simply the first two
# characters of the field.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/
            unless defined $first && defined $second;
        return "$first$second";
    }

    # only the first two components contribute to the name
    my @leading = @components[0, 1];
    foreach my $part (@leading) {
        $part =~ s/^(.).*$/$1/;
    }
    return join("", @leading);
}
365
# Produces the next candidate directory name for an index whose current
# name collides with one already in use.
#
# Arguments:
#   $namehash - hash of hashes ('index', 'subcollection', 'languages')
#               recording which source value each processed name is
#               currently assigned to
#   $index    - the index description being named
#   $indexref, $subref, $langref - references to the three processed
#               name components; at most one of them is modified
#
# The first component (checked in the order index, subcollection,
# languages) whose processed name is recorded for a different source
# value is advanced with get_next_version. Returns the concatenation of
# the three components.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @slots = (['index',         $indexref, "$level:$gran"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $slot (@slots) {
        my ($kind, $nameref, $owner) = @$slot;
        next unless $namehash->{$kind}->{$$nameref} ne $owner;

        # only the first clashing component is changed per call
        $self->get_next_version ($nameref);
        last;
    }

    return join ("", $$indexref, $$subref, $$langref);
}
380
# Advances a processed name to its next numbered version, in place.
#
# Argument: $nameref - reference to the name to change.
#
# A name that does not end in a digit has its last character replaced by
# "0". A name ending in digits has that trailing number incremented, so
# "dt0" becomes "dt1", "d9" becomes "d10" and "d99" becomes "d100".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # the whole trailing number is incremented and written back, so
        # a carry (9 -> 10, 99 -> 100) simply lengthens the name
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d+$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
396
# Builds one index.
#
# Argument: $index - an index description ("level:fields[:subcollection]")
# for which a directory name exists in the object's 'index_mapping'.
#
# The documents are run through the plugins twice, piped into mg_passes
# (-N1 to build the index dictionary, -N2 to invert the text), with
# mg_perf_hash_build run in between. Afterwards the weights file, the
# 'on-disk' stemmed dictionary and the stem indexes are created and
# every file in the index directory whose suffix is not listed in
# %wanted_index_files is deleted. In debug mode no external program is
# run and both passes are written to STDOUT.
#
# Dies if one of the mg programs cannot be found or started, or if the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    my $languages = $self->{'collect_cfg'}->{'languages'};
    my @languages = defined ($languages) ? @$languages : ();
    foreach my $configured (@languages) {
        # work on a copy: the loop variable aliases the list element, and
        # the leading "!" must survive in the configuration for the
        # indexes that are built after this one
        my $language = $configured;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");


        # creates stem index files for the various stemming methods
        print STDERR "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");
        system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
549
# Creates the collection's info database and processes associated files.
#
# The database is written by piping text into txt2db; its file lives in
# the build directory's "text" subdirectory and is named after the
# collection with the extension ".ldb" when util::is_little_endian is
# true and ".bdb" otherwise. In debug mode txt2db is not run and the
# text goes to STDOUT.
#
# The output consists of an optional "[collection]" record holding the
# collectionmeta settings, followed by whatever the build processor
# writes for the documents and the classification information.
#
# Dies if txt2db cannot be found or started.
sub make_infodatabase {
    my ($self) = @_;

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print STDERR "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database text goes (handles are passed by name)
    my $handle = 'STDOUT';
    unless ($self->{'debug'}) {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the mapping is needed to translate ".index" style keys
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $key (keys (%$collectionmeta)) {
            if ($key !~ /^\./) {
                # ordinary metadata is written under its own name
                print $handle "<$key>$self->{'collect_cfg'}->{'collectionmeta'}->{$key}\n";
                next;
            }

            # a key starting with "." names an index description; it is
            # written under that index's directory name
            my $cmeta = substr ($key, 1);
            if (defined $self->{'index_mapping'}->{$cmeta}) {
                print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                    $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n";
            } else {
                print STDERR "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
626
# Hook for collection specific processing; this implementation does
# nothing beyond taking the object off the argument list.
sub collect_specific {
    my $self = shift;
}
630
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the build processor, the index / subcollection / language
# name mappings (in the order they were created) and the list of indexes
# that were not built. The subcollection and language maps are only
# written when they are not empty.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh configuration for every call, so that nothing written by an
    # earlier call (or an earlier builder object) can leak into this file
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
677
# Counterpart of init; this implementation does nothing beyond taking
# the object off the argument list.
sub deinit {
    my $self = shift;
}
681
682
6831;
684
685
Note: See TracBrowser for help on using the repository browser.