source: trunk/gsdl/perllib/mgbuilder.pm@ 1679

Last change on this file since 1679 was 1679, checked in by sjboddie, 23 years ago

Re-Added recent changes that were lost when the CVS repository was moved.
Mostly stuff to do with the collector and building code.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.1 KB
Line 
###########################################################################
#
# mgbuilder.pm -- MGBuilder object
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch autoflush on for STDOUT and STDERR as soon as the module is
# compiled, so that output from the mg programs does not get out of sync
# with output from the plugins.
BEGIN {
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

# Switch autoflush off again when the interpreter shuts down.
END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# Value passed to mg_passes with its -b option (maximum document size).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index deletes files in the index directory whose suffix is not
# listed here.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Creates an mgbuilder object.
#
# Arguments (positional, after the class name):
#   $collection         - name of the collection being built
#   $source_dir         - directory handed to the plugins to read from
#   $build_dir          - directory the build is written into
#   $verbosity          - progress messages are printed when this is >= 1
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug              - when true the builder writes to STDOUT instead of
#                         piping its output into the mg programs
#   $keepold            - when true init() leaves an existing build alone
#   $allclassifications - passed to classify::output_classify_info
#   $outhandle          - handle for messages; STDERR when not supplied
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and by language, loads the plugins and classifiers, and
# creates the document processor ("buildproc").
#
# Dies when collect.cfg is missing, when no plugin could be loaded, or when
# the buildproc cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    # handles are passed around by name in this module
    $outhandle = 'STDERR' unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language as "index:language"
    # NOTE(review): when no indexsubcollections are configured the language
    # ends up as the third ':' separated field, which create_index_mapping
    # and build_index read as the subcollection - confirm whether an empty
    # subcollection field should be inserted here.
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor through the class name instead of evaluating a
    # generated code string: the collection name is then never parsed as
    # Perl source.  Failures are still re-thrown as a plain string.
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}
159
# Prepares the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the existing build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my $self = shift;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $builddir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # the compressed text lives in its own subdirectory
        &util::mk_all_dir("$builddir/text");
    }
}
173
# Creates the compressed text of the collection.
#
#   $textindex - index description handed to the buildproc with set_index
#
# The plugins are run over the source directory twice with the buildproc in
# 'text' mode: the first pass feeds "mg_passes -T1" (text statistics), then
# mg_compression_dict builds the compression dictionary, and the second
# pass feeds "mg_passes -T2" (the actual compression).  In debug mode the
# buildproc writes to STDOUT and none of the mg programs are started.
#
# Each program is checked for under $GSDLHOME/bin/$GSDLOS but is started by
# its bare name.  Dies when a program is missing or a pipe cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_path = &util::filename_cat($bindir, "mg_passes$exe");
    my $mg_dict_path = &util::filename_cat($bindir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $debug = $self->{'debug'};
    my $buildproc = $self->{'buildproc'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $textprefix = &util::filename_cat ($self->{'build_dir'},
                                          "text/$self->{'collection'}");

    # windows wants backslashes in the path; everywhere else mg is given
    # "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $textprefix =~ s{/}{\\}g;
    } else {
        $osextra = " -d /";
    }

    # -b $maxdocsize sets the maximum document size
    my $stats_cmd = "| mg_passes$exe -f \"$textprefix\" -b $maxdocsize -T1 $osextra";
    my $dict_cmd = "mg_compression_dict$exe -f \"$textprefix\" -S -H -2 -k 5120 $osextra";
    my $compress_cmd = "| mg_passes$exe -f \"$textprefix\" -b $maxdocsize -T2 $osextra";

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # first pass: collect the statistics for the text
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        unless ((-e $mg_passes_path) && open (PIPEOUT, $stats_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_path\n";
        }
        # the pipe is handed on by name
        $handle = 'mgbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$debug) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        unless (-e $mg_dict_path) {
            die "mgbuilder::compress_text - couldn't run $mg_dict_path\n";
        }
        system ($dict_cmd);

        # reopen the same named handle for the second pass
        unless ((-e $mg_passes_path) && open ($handle, $compress_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_path\n";
        }
    }

    # second pass: compress the text
    $buildproc->reset();
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
257
# Decides whether an index should be built.
#
#   $index - index description
#
# Every entry of the collection's "dontbuild" configuration is used as an
# anchored pattern.  On the first entry that matches $index, the directory
# name mapped to the index is appended to the 'notbuilt' list and 0 is
# returned.  Returns 1 when nothing matches or no dontbuild list exists.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined $dontbuild;

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that is being left out
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
273
# Builds the indexes of the collection.
#
#   $indexname - optional; when it contains a word character only this one
#                index is considered, otherwise every index listed in the
#                collection configuration is.
#
# The description-to-directory mapping is recreated for the chosen indexes
# and stored in $self->{'index_mapping'}.  Each index is then either built
# with build_index or skipped, depending on what want_built says.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $description (@$indexes) {
        unless ($self->want_built($description)) {
            print $outhandle "\n*** ignoring index $description\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $description in subdirectory " .
            "$self->{'index_mapping'}->{$description}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($description);
    }
}
301
# Creates directory names for each of the index descriptions.
#
#   $indexes - reference to a list of index descriptions of the form
#              "level:fields[:subcollection[:languages]]"
#
# Returns a reference to a hash holding
#   <index description>     => directory name of that index
#   'indexmap'              => { "level:fields" => short name }
#   'subcollectionmap'      => { subcollection  => short name }
#   'languagemap'           => { languages      => short name }
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                           => the keys of the three maps in the order
#                              they were first seen
#
# A directory name is the short index name followed by the short
# subcollection and language names; make_unique is called until the name
# collides with no other directory.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # short name => full name, kept separately for each part of a
    # description.  Each table must be a hash of its own: starting them off
    # as '' made all three resolve to one shared symbolic hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # descriptions that have been given a directory already
    my %seen = ();

    foreach my $index (@$indexes) {
        # a repeated description keeps the directory it already has; it
        # could never be made unique against itself
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # the order list has to carry the same key as the map
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
365
# Returns a processed (shortened) version of a field.
# A field is a comma separated list of components.  If the field has only
# one component the processed version contains the first character and the
# next consonant of that component (or simply its first two characters when
# no consonant follows) - otherwise it contains the first character of each
# of the first two components.
# An undefined field, or one without any word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }

    # several components: the initials of the first two
    my @initials = @components[0, 1];
    foreach my $component (@initials) {
        $component =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
388
# Changes one part of a colliding directory name and returns the new name.
#
#   $namehash  - hash with the tables 'index', 'subcollection' and
#                'languages', each mapping a short name to the full name
#                it has been given to
#   $index     - index description "level:fields[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#              - references to the short index, subcollection and language
#                names; one of them is modified in place
#
# The first short name that is not recorded as belonging to the matching
# part of $index is advanced with get_next_version.  If all three are
# recorded for this very index the index part is advanced anyway, because
# callers repeat the call until the name no longer collides and would spin
# forever on a name that never changes.
#
# Returns the concatenation of the three short names.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # nothing distinguishes the parts - still make progress
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
403
# Advances a short name to its next version, modifying it in place.
#
#   $nameref - reference to the name
#
# A name ending in two digits has that number incremented; a name ending in
# a single digit has the digit incremented, with 9 becoming 10; any other
# name has its last character replaced by 0.  Every call changes the name,
# so a caller looping until the name is unique always makes progress.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only one digit ends the name here, so that single digit is
            # what has to be replaced
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } else {
        # an empty name has no last character to replace - append instead
        $$nameref =~ s/.$/0/ or $$nameref .= "0";
    }
}
419
# Builds one index.
#
#   $index - index description "level:fields[:subcollection]"; its directory
#            name is looked up in $self->{'index_mapping'}
#
# The plugins are run twice with the buildproc in 'text' mode and indexing
# switched on: the first pass feeds "mg_passes -N1" (index dictionary), then
# mg_perf_hash_build is run, and the second pass feeds "mg_passes -N2"
# (inversion).  Afterwards mg_weights_build, mg_invf_dict and mg_stem_idx
# are run and every file in the index directory whose suffix is not listed
# in %wanted_index_files is deleted.  In debug mode the buildproc writes to
# STDOUT and none of the mg programs are run.
#
# Each program is checked for under $GSDLHOME/bin/$GSDLOS but is started by
# its bare name.  Dies when a program is missing, a pipe cannot be opened,
# or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for the configured languages
    # NOTE(review): every configured language is added for every index,
    # not only the language named in the index description - confirm
    # whether that is intended.
    my $languages = $self->{'collect_cfg'}->{'languages'};
    foreach my $configured (defined $languages ? @$languages : ()) {
        # work on a copy: the loop variable aliases the configuration entry,
        # so stripping the leading '!' in place would change the language
        # list seen by every index built after this one
        my $language = $configured;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        # the pipe is handed on by name
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same named handle for the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
585
# Creates the collection's info database and processes associated files.
#
# The buildproc is switched to 'infodb' mode and writes into a pipe to
# txt2db (or to STDOUT in debug mode).  The database is named after the
# collection inside the build's "text" directory, with the extension .ldb
# when util::is_little_endian is true and .bdb otherwise.  The
# collection-level metadata is written first as a "[collection]" record,
# then the plugins are run, then the classifiers write their information.
#
# txt2db is checked for under $GSDLHOME/bin/$GSDLOS but started by its bare
# name.  Dies when it is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s{/}{\\}g;
    }

    my $bindir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_path = &util::filename_cat($bindir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # decide where the document processor writes to
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless ((-e $txt2db_path) && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_path\n";
        }
        # the pipe is handed on by name
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        # the metadata may refer to indexes, so their directory names are needed
        unless (defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            # ordinary metadata is written under its own name
            if ($key !~ /^\./) {
                print $handle "<$key>$collectionmeta->{$key}\n";
                next;
            }

            # a leading '.' marks metadata that belongs to an index; it is
            # written under the directory name of that index
            my $description = substr ($key, 1);
            my $dirname = $self->{'index_mapping'}->{$description};
            if (defined $dirname) {
                print $handle "<$dirname>" . $collectionmeta->{$key} . "\n";
            } else {
                print $outhandle "mgbuilder: warning bad collectionmeta option '$description' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
665
# Placeholder: takes the object and performs no work.
# NOTE(review): presumably a hook for collection-specific builders to
# override - confirm against the callers.
sub collect_specific {
    my $self = shift;
}
669
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the number of words and sections reported by
# mgstat (when it can be run), the index / subcollection / language maps
# from $self->{'index_mapping'}, and the list of indexes that were not
# built.
#
# mgstat is checked for under $GSDLHOME/bin/$GSDLOS but started by its bare
# name; when it is unavailable a warning is printed and its two values are
# left out.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a hash local to this call.  It used
    # to be written to the package variable $build_cfg, so values from an
    # earlier build in the same process could end up in this build.cfg.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information; the two patterns name the keys that
    # are passed to write_cfg_file separately (single values, then lists)
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
737
# Counterpart of init: takes the object and performs no work.
sub deinit {
    my $self = shift;
}
741
# Prints the statistics of the pass that has just finished to the message
# handle: which index was processed, the number of bytes in the collection
# and the number of bytes processed for the index, all as reported by the
# buildproc.  A warning is added when fewer than 50 bytes were processed.
sub print_stats {
    my $self = shift;

    my $out = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my @summary = ();
    if ($indexing_text) {
        push (@summary, "Stats (Creating index $index)\n");
    } else {
        push (@summary, "Stats (Compressing text from $index)\n");
    }
    push (@summary, "Total bytes in collection: $num_bytes\n");
    push (@summary, "Total bytes in $index: $num_processed_bytes\n");
    print $out @summary;

    # next to nothing to work on usually means trouble in the step that
    # follows
    if ($num_processed_bytes < 50) {
        my @warning = ("***************\n");
        push (@warning, "WARNING: There is very little or no text to process for $index\n");
        if ($indexing_text) {
            push (@warning, "This may cause an error while attempting to build the index\n");
        } else {
            push (@warning, "This may cause an error while attempting to compress the text\n");
        }
        push (@warning, "***************\n");
        print $out @warning;
    }
}
770
7711;
772
773
Note: See TracBrowser for help on using the repository browser.