source: trunk/gsdl/perllib/mgbuilder.pm@ 2811

Last change on this file since 2811 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.1 KB
RevLine 
[537]1###########################################################################
[4]2#
[537]3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
26package mgbuilder;
27
[215]28use classify;
[4]29use cfgread;
30use colcfg;
31use plugin;
32use util;
[1304]33use FileHandle;
[4]34
# Turn autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that output from mg doesn't get out of sync with output
# from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush off again when the program exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value handed to mg_passes through its -b option (maximum document size).
$maxdocsize = 12000;

# Suffixes of the index files that are kept; build_index deletes every file
# in an index directory whose suffix is not a key of this hash.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Creates an mgbuilder object for one collection.
#
# Arguments (after the class name):
#   $collection         - collection name (used to find a collection
#                         specific buildproc and in error messages)
#   $source_dir         - directory the plugins read documents from
#   $build_dir          - directory the build is written to
#   $verbosity          - verbosity level
#   $maxdocs            - passed on to the plugins when reading documents
#   $debug, $keepold    - build flags stored on the object
#   $allclassifications - passed on when classification info is output
#   $outhandle          - handle for progress messages (default STDERR)
#   $no_text            - true if text should not be stored (default 0)
#   $failhandle         - handle given to plugin::load_plugins (default STDERR)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers and creates
# the buildproc object.  Dies if collect.cfg is missing, if no plugins
# could be loaded, or if the buildproc cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    # defaults for the optional trailing arguments
    $outhandle  = 'STDERR' unless defined $outhandle;
    $no_text    = 0        unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'failhandle'         => $failhandle,
        'notbuilt'           => [],     # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    unless (-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once for
    # each configured subcollection
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once for each
    # configured language
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};

        # when there are no subcollections an empty subcollection field
        # is added in front of the language
        my $separator =
            defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";

        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}},
                      $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence of each index is the one that is kept)
    my %seen = ();
    my @unique = grep { !$seen{$_}++ } @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = \@unique;

    # get the list of plugins for this collection and load them all
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];
    $self->{'pluginfo'} =
        &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} =
        &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, hence the string eval; the
    # evaluated code refers to the lexical variables of this sub by name
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
180
# Prepares the build directory.  Unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed and created
# again together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # make the text directory
        &util::mk_all_dir ("$self->{'build_dir'}/text");
    }
}
194
# Creates the compressed text for the collection.
#
# $textindex is handed to the buildproc with set_index.  The documents are
# read through the plugins twice: the first pass pipes the text to
# "mg_passes -T1" to collect statistics, then mg_compression_dict builds
# the compression dictionary, and the second pass pipes the text to
# "mg_passes -T2" to compress it.  In debug mode no mg program is run and
# the text of both passes is sent to STDOUT instead.
#
# Dies if an mg executable cannot be found or a pipe cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe =
        &util::filename_cat ($exedir, "mg_compression_dict$exe");

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug     = $self->{'debug'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $fulltextprefix =
        &util::filename_cat ($self->{'build_dir'}, "text/$self->{'collection'}");

    # windows wants backslashes in the prefix, everything else gets "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if $verbose;

    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        # the executable is checked by its full path but run by name
        my $stats_pass =
            "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra";
        if (!-e "$mg_passes_exe" || !open (PIPEOUT, $stats_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    # first pass over the documents
    &plugin::begin ($self->{'pluginfo'}, $self->{'source_dir'},
                    $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end ($self->{'pluginfo'});

    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$debug) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        my $compress_pass =
            "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra";
        if (!-e "$mg_passes_exe" || !open ($handle, $compress_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();

    # compress the text (second pass over the documents)
    print $outhandle "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
280
# Decides whether an index should be built.
#
# Every entry of the collection's "dontbuild" configuration is used as a
# pattern that has to match the whole index description.  On the first
# match the directory name of the index (from index_mapping) is appended
# to the object's "notbuilt" list and 0 is returned; otherwise 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
296
# Builds the indexes of the collection.
#
# If $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is.  The
# mapping from index descriptions to directory names is (re)created first
# and stored in $self->{'index_mapping'}.  Indexes rejected by want_built
# are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index ($index);
    }
}
324
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# level:fields[:subcollection[:languages]].  Returns a reference to a hash
# containing
#   - for every index description, its directory name
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the processed name
#     of each "level:fields", subcollection and language part
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     keys of those maps in the order they were first seen
#   - the "level:fields", subcollection and language parts themselves as
#     top level keys (used for collectionmeta later on)
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames records which index/subcollection/language each processed
    # name was created for; make_unique uses it to decide which part of a
    # colliding directory name has to change.  Each entry must be its own
    # hash reference: initialising them with '' makes every
    # $pnames{...}{...} access a symbolic dereference of one shared hash,
    # so the three tables overwrite each other.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what this directory name and its parts were used for
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
396
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component - otherwise it will contain the first
# character of the first two components
#
# Components are separated by commas.  An undefined field, or one without
# any word character, gives the empty string.  If a single component has
# no consonant after its first character, its first two characters are
# used; if it is too short even for that, its first word character is.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # reduce each of the first two components to its first character
        my @initials = @components[0, 1];
        foreach my $initial (@initials) {
            $initial =~ s/^(.).*$/$1/;
        }
        return join ("", @initials);
    }

    # a single component: first character plus the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or simply the first two characters if there is no such consonant
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # neither pattern matches a one-character field; use its first word
    # character (the guard above guarantees there is one) instead of
    # interpolating undefined values into an empty name
    unless (defined $first && defined $second) {
        my ($only) = $field =~ /(\w)/;
        return $only;
    }

    return "$first$second";
}
419
# Changes one part of a colliding directory name and returns the new name.
#
# $namehash holds, under 'index', 'subcollection' and 'languages', the
# value each processed name is already used for.  $indexref, $subref and
# $langref refer to the processed names making up the directory name of
# $index.  The first of the three parts whose processed name is recorded
# for something other than the corresponding part of $index is advanced
# with get_next_version (the referenced variable is modified); the
# concatenation of the three names is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [table in $namehash, processed name, part of $index it stands for]
    my @parts = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $part (@parts) {
        my ($table, $nameref, $wanted) = @$part;
        next unless $namehash->{$table}->{$$nameref} ne $wanted;

        # only the first clashing part is changed on each call
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
434
# Advances the name referred to by $nameref to its next version, in place.
#
# A name ending in digits has that number incremented (at most the last
# two digits are looked at); any other name has its last character
# replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # exactly one trailing digit, so only that digit may be replaced;
        # a trailing 9 becomes 10.  (Substituting on \d\d here can never
        # match and would leave the name unchanged, which makes the
        # caller's collision loop spin forever.)
        my $num = $1; $num ++;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
450
# Builds one mg index.
#
# $index is an index description of the form
# level:fields[:subcollection[:language]].  The index is written to the
# subdirectory of the build directory that $self->{'index_mapping'} gives
# for $index, so the mapping has to exist already (build_indexes creates
# it before calling this method).
#
# The documents are read through the plugins twice and piped to mg_passes
# (-N1, then -N2 after mg_perf_hash_build has run); afterwards
# mg_weights_build, mg_invf_dict and mg_stem_idx are run and every file
# whose suffix is not in %wanted_index_files is deleted from the index
# directory.  In debug mode no mg program is run and the text of both
# passes goes to STDOUT.
#
# Dies if an mg executable cannot be found, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe          = &util::filename_cat ($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat ($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe   = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe       = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe        = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $debug   = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $wanted (@languages) {
        # a leading "!" negates the language and is stripped from its name
        my $negated = ($wanted =~ s/^\!//) ? 1 : 0;

        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $known eq $wanted;
            push (@$indexexparr,
                  $negated ? "!Language/$wanted/" : "Language/$wanted/");
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if $verbose;
    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        # the executable is checked by its full path but run by name
        my $first_pass =
            "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra";
        if (!-e "$mg_passes_exe" || !open (PIPEOUT, $first_pass)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text (1);

    # first pass over the documents
    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();

    if (!$debug) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        my $second_pass =
            "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -c 3 -G -t 10 -N2 $osextra";
        if (!-e "$mg_passes_exe" || !open ($handle, $second_pass)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text (second pass over the documents)
    print $outhandle "\n inverting the text\n" if $verbose;

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debug) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $method (1, 2, 3) {
            system ("mg_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir (DIR)) {
            next if $file =~ /^\./;

            # files without a suffix are always kept
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix && !defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
623
# Creates the collection's info database and processes associated files.
#
# The database text is piped to txt2db, which is given the database file
# name in the "text" directory of the build (extension .ldb on little
# endian machines, .bdb otherwise).  In debug mode the text is written to
# STDOUT and txt2db is not run.  Written, in this order, are:
#   - a [collection] record built from the collectionmeta configuration
#   - whatever the buildproc outputs in 'infodb' mode while the plugins
#     read the documents
#   - the classification information of the classifiers
#   - a [browselist] record listing the documents
#
# Dies if txt2db cannot be found or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    my $textdir  = &util::filename_cat ($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat ($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat ($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        # the executable is checked by its full path but run by name
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text (1);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the index mapping is needed to translate the ".name" entries
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $defaultfound = 0;
            my $seen_first = 0;
            my $metadata_entry = "";
            my $default = "";
            my $cmetamap = "";

            if ($cmeta =~ s/^\.//) {
                # an entry starting with "." is output under the name the
                # index mapping holds for the rest of the entry
                unless (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
                $cmetamap = $self->{'index_mapping'}->{$cmeta};
                $cmeta = ".$cmeta";     # put the "." back for the lookups below
            } else {
                $cmetamap = $cmeta; # just using the same name
            }

            #iterate through the languages
            foreach my $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
                unless ($seen_first) {
                    $seen_first = 1;
                    #set the default default to the first entry
                    $default = $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                }

                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} .
                        "\n" . $metadata_entry;
                    next;
                }

                # language specific entries have keys of the form [l=xx]
                my ($code) = $lang =~ /^\[l=(\w*)\]$/;
                if ($code) {
                    $metadata_entry .= "<$cmetamap:$code>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";
                }
            }

            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    my @browselist = (
        "[browselist]\n",
        "<hastxt>0\n",
        "<childtype>VList\n",
        "<numleafdocs>" . scalar(@doclist) . "\n",
        "<thistype>Invisible\n",
        "<contains>$docs",
        "\n" . ('-' x 70) . "\n",
    );
    foreach my $piece (@browselist) {
        print $handle $piece;
    }

    close ($handle) if !$self->{'debug'};
}
755
# Placeholder that does no work in this class.  The value of the
# invocant is what the sub evaluates to.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
759
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts parsed from the
# output of mgstat (when it can be run), the index, subcollection and
# language maps taken from $self->{'index_mapping'}, and the list of
# indexes that were not built.  The subcollection and language maps and
# the notbuilt list are only written when they are not empty.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # the configuration is collected in a hash that is private to this
    # call.  It must not be a package variable: some keys are only set
    # conditionally and would otherwise survive from a previous build made
    # by the same process.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # get the text directory
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    # (the executable is checked by its full path but run by name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
826
# Placeholder that does no work in this class.  The value of the
# invocant is what the sub evaluates to.
sub deinit {
    my ($self) = @_;
    return $self;
}
830
# Prints statistics about the pass the buildproc has just completed to the
# builder's output handle: the index concerned, the number of bytes in the
# collection and the number of bytes processed for the index.  A warning is
# added when fewer than 50 bytes were processed, unless the pass was a text
# compression pass of a build that stores no text.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        my $banner = "***************\n";

        print $outhandle $banner;
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        } elsif (!$self->{'no_text'}) {
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle $banner;
    }
}
859
[4]8601;
Note: See TracBrowser for help on using the repository browser.