source: trunk/gsdl/perllib/mgbuilder.pm@ 1954

Last change on this file since 1954 was 1803, checked in by paynter, 23 years ago

Moved the phind classifier's data directory into the index directory. This
means we no longer overwrite existing phind classifier data during a build.
I had to tweak the classifier code to pass the location of the building
directory to each classifier as an argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.4 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Turn autoflush on for STDOUT and STDERR while the module is in use so
# that output from the mg programs does not get out of sync with output
# from the plugins.
BEGIN {
    STDOUT->autoflush(1);
    STDERR->autoflush(1);
}

# Switch autoflush off again when the interpreter shuts down.
END {
    STDOUT->autoflush(0);
    STDERR->autoflush(0);
}

# value handed to "mg_passes -b" as the maximum document size
$maxdocsize = 12000;

# suffixes of the files build_index keeps in an index directory;
# files with any other suffix are deleted once the index is built
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);

59
60
# Constructor for an mgbuilder object.
#
# Arguments (after the class name):
#   $collection          name of the collection being built
#   $source_dir          directory handed to the plugins as their source
#   $build_dir           directory the build is written into
#   $verbosity           verbosity level for progress messages
#   $maxdocs             handed on to plugin::begin and plugin::read
#   $debug               true to write to STDOUT instead of the mg programs
#   $keepold             true to make init() leave an old build in place
#   $allclassifications  handed on to classify::output_classify_info
#   $outhandle           handle for messages; STDERR when not defined
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index list by the
# configured subcollections and languages, loads the plugins and the
# classifiers and creates the build processor.
# Dies if collect.cfg is missing, if no plugins load, or if the build
# processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => [],     # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # every index is built once per subcollection: "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # ... and once per language: "index:language"
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
            }
        }
    }

    # drop repeated index descriptions, keeping the first of each
    my %seen = ();
    my @all_indexes = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [ grep { !$seen{$_}++ } @all_indexes ];

    # get the list of plugins for this collection and load them
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # remember the dontgdbm fields as a lookup table
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building: a collection specific
    # <collection>buildproc.pm wins over the standard mgbuildproc
    my $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, hence the string eval
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
170
# Prepares the build directory.  Unless the builder is in debug or
# keepold mode, any old build is removed and the build directory and
# its "text" subdirectory are created afresh.
sub init {
    my $self = shift;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
184
# Creates the compressed text of the collection.
#
# $textindex is handed to the build processor with set_index.
#
# The documents are read three times through the plugins: once piped
# into "mg_passes -T1" to collect statistics, then (after
# mg_compression_dict has been run) once into "mg_passes -T2" to
# compress the text.  In debug mode no mg program is run and the build
# processor writes to STDOUT instead.
#
# Dies if one of the mg executables is missing from
# $GSDLHOME/bin/$GSDLOS or if a pipe cannot be opened.  Note that the
# programs themselves are started by bare name, i.e. found via the PATH.
sub compress_text {
    my $self = shift;
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $fulltextprefix =
        &util::filename_cat ($self->{'build_dir'}, "text/$self->{'collection'}");

    # windows wants backslashes in the path; everywhere else the mg
    # programs get an extra "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n"
            unless (-e "$mg_passes_exe" &&
                    open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra"));
        $handle = mgbuilder::PIPEOUT;
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"
            unless -e "$mg_compression_dict_exe";
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # second pass: -b $maxdocsize sets the maximum document size
        die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n"
            unless (-e "$mg_passes_exe" &&
                    open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra"));
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
268
# Decides whether an index should be built.
#
# Every entry of the "dontbuild" list in the collection configuration
# is used as a pattern that has to match the whole index description.
# On the first match the directory name of the index is appended to
# $self->{'notbuilt'} and 0 is returned; otherwise the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
284
# Builds the indexes of the collection.
#
# When $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is.
# The mapping from index descriptions to directory names is stored in
# $self->{'index_mapping'} before any index is built; indexes rejected
# by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
312
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that maps every index description to
# its (unique) directory name and that additionally holds
#   'indexmap', 'subcollectionmap', 'languagemap'
#       the processed name of each "level:fields", subcollection and
#       languages component, and
#   'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#       the components in the order they were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original component, one table per component kind.
    # These have to be real (and separate) hash references: with empty
    # strings in their place the nested lookups below become symbolic
    # references and all three kinds end up in one shared package hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique; make_unique edits
        # $pindex, $psub and $plang through the references it is given
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the key that was just stored in 'languagemap', so that
            # every entry of the order list can be looked up in the map
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
376
# Returns a processed (abbreviated) version of a field.
#
# A field is a comma separated list of components.
#   - two or more components: the first character of each of the first
#     two components
#   - one component: its first character followed by the next consonant
#     or, when no consonant follows, by its second character
#   - undefined, or without any word character: the empty string
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components contribute to the name
        my @initials = @components[0, 1];
        foreach my $component (@initials) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    unless (defined $first && defined $second) {
        # no consonant after the first character: take the first two characters
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
399
# Changes one part of a colliding directory name and returns the new name.
#
# $namehash holds, per kind ('index', 'subcollection', 'languages'), the
# component each processed name is already used for.  $indexref, $subref
# and $langref are references to the three processed parts of the name
# for $index.  The first part whose processed name is recorded for a
# different component is moved on to its next version (in place, through
# the reference); the return value is the three parts joined together.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # (table of used names, reference to the processed name, wanted component)
    my @parts = (
        [$namehash->{'index'},         $indexref, "$level:$gran"],
        [$namehash->{'subcollection'}, $subref,   $subcollection],
        [$namehash->{'languages'},     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($used, $nameref, $component) = @$part;
        if ($used->{$$nameref} ne $component) {
            # only the first clashing part is changed per call
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
414
# Moves a processed name on to its next version, in place.
#
# $nameref is a reference to the name.
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has that digit incremented; 9 becomes 10
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # In this branch the name ends in exactly one digit, so only that
        # digit can be replaced.  (Substituting /\d\d$/ here never matches,
        # which left a name ending in 9 unchanged and made the collision
        # loop of the caller spin forever.)
        my $num = $1; $num ++;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
430
# Builds a single index with the mg programs.
#
# $index is an index description ("level:fields[:subcollection...]");
# its directory name is looked up in $self->{'index_mapping'}.
#
# The documents are read twice through the plugins: once piped into
# "mg_passes -N1" to create the index dictionary and, after
# mg_perf_hash_build has been run, once into "mg_passes -N2" to invert
# the text.  Then the weights file, the on-disk stemmed dictionary and
# the stem indexes are created and all files whose suffix is not listed
# in %wanted_index_files are removed from the index directory.
# In debug mode no mg program is run and the build processor writes to
# STDOUT instead.
#
# Dies if one of the mg executables is missing from
# $GSDLHOME/bin/$GSDLOS, if a pipe cannot be opened or if the index
# directory cannot be read.  Note that the programs themselves are
# started by bare name, i.e. found via the PATH.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcoll (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcoll})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcoll});
        }
    }

    # add expressions for the configured languages
    # NOTE(review): every configured language is added for every index,
    # whatever language the index description itself names - confirm
    # that this is intended.
    foreach my $configured_language (@{$self->{'collect_cfg'}->{'languages'}}) {
        # Work on a copy.  foreach aliases its loop variable to the list
        # element, so stripping the "!" from the element itself would
        # edit the collection configuration and the negation would be
        # lost for every index built after the first one.
        my $language = $configured_language;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second pass
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");


        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
596
# Creates the info database and processes the associated files.
#
# The build processor is switched to 'infodb' mode and its output is
# piped into txt2db, which is given the name of the database file in
# the "text" directory of the build (suffix .ldb on little endian
# machines, .bdb otherwise).  The collection metadata is written
# first, then the documents are read through the plugins and finally
# the classification information is output.  In debug mode everything
# is written to STDOUT instead.
#
# Dies if txt2db is missing from $GSDLHOME/bin/$GSDLOS or the pipe
# cannot be opened.  Note that txt2db itself is started by bare name,
# i.e. found via the PATH.
sub make_infodatabase {
    my $self = shift;
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the build processor writes to
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n"
            unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\""));
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        # the directory names are needed for the index specific entries
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            # a key starting with "." names an index description; it is
            # written under the directory name of that index
            my $name = $key;
            if ($name =~ s/^\.//) {
                my $dirname = $self->{'index_mapping'}->{$name};
                if (defined $dirname) {
                    print $handle "<$dirname>" . $collectionmeta->{$key} . "\n";
                } else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
                }
            } else {
                print $handle "<$key>$collectionmeta->{$key}\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) unless $self->{'debug'};
}
676
# Hook for collection specific processing.  Does nothing here.
sub collect_specific {
    my $self = shift;
}
680
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the build processor, the number of words and sections
# reported by mgstat (when it can be run), the index, subcollection
# and language mappings created by create_index_mapping and the list
# of indexes that were not built.
#
# A missing mgstat only produces a warning on the output handle.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a fresh hash for every call.
    # (It used to be written to the package variable $build_cfg, next to
    # an unused lexical %build_cfg, so optional entries such as
    # 'notbuilt' from an earlier call in the same process could end up
    # in a later build.cfg.)
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when not empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    # NOTE(review): the two patterns presumably tell write_cfg_file which
    # keys hold single values and which hold lists - confirm in cfgread.
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
748
# Counterpart of init.  There is nothing to clean up here.
sub deinit {
    my $self = shift;
}
752
# Prints the statistics of the last pass to the output handle.
#
# The current index, the number of bytes in the collection and the
# number of bytes processed for the index are taken from the build
# processor.  When fewer than 50 bytes were processed a warning is
# printed as well, as so little text may make the following mg step fail.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50) {
        my $consequence = $indexing_text
            ? "This may cause an error while attempting to build the index\n"
            : "This may cause an error while attempting to compress the text\n";

        print $outhandle "***************\n";
        print $outhandle "WARNING: There is very little or no text to process for $index\n";
        print $outhandle $consequence;
        print $outhandle "***************\n";
    }
}
781
7821;
783
784
Note: See TracBrowser for help on using the repository browser.