source: trunk/gsdl/perllib/mgbuilder.pm@ 1954

Last change on this file since 1954 was 1803, checked in by paynter, 23 years ago

Moved the phind classifier's data directory into the index directory. This
means we no longer overwrite existing phind classifier data during a build.
I had to tweak the classifier code to pass the location of the building
directory to each classifier as an argument.

  • Property svn:keywords set to Author Date Id Revision
File size: 26.4 KB
RevLine 
[537]1###########################################################################
[4]2#
[537]3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
[4]25
26package mgbuilder;
27
[215]28use classify;
[4]29use cfgread;
30use colcfg;
31use plugin;
32use util;
[1304]33use FileHandle;
[4]34
# Switch autoflush on for STDOUT and STDERR as soon as this module is
# compiled, so that output from mg doesn't get out of sync with the
# output of the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# value handed to mg_passes through its -b (maximum document size) option
$maxdocsize = 12000;

# suffixes of the files that are kept in an index directory once an index
# has been built; files with any other suffix are deleted by build_index
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor for the mgbuilder object.
#
# Reads the collection's collect.cfg, expands the configured indexes with
# any subcollection and language partitions, removes duplicate indexes,
# loads the plugins and classifiers named in the configuration and creates
# the document processor used for building.
#
# Arguments: $class, $collection, $source_dir, $build_dir, $verbosity,
#            $maxdocs, $debug, $keepold, $allclassifications, $outhandle
#            ($outhandle falls back to STDERR when it is undefined).
#
# Returns the new object.  Dies if collect.cfg cannot be found, if no
# plugins were loaded, or if the document processor could not be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    # without an explicit handle, messages go to STDERR (referred to by name)
    $outhandle = 'STDERR' unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => [],     # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once for
    # each entry of indexsubcollections
    if (defined $cfg->{'indexsubcollections'}) {
        my $plain_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$plain_indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: the same expansion again, this time
    # over the configured languages
    if (defined $cfg->{'languages'}) {
        my $plain_indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$plain_indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$language");
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, the order is kept)
    my %seen = ();
    my @unique = ();
    foreach my $index (@{$cfg->{'indexes'}}) {
        next if defined ($seen{$index});
        $seen{$index} = 1;
        push (@unique, $index);
    }
    $cfg->{'indexes'} = \@unique;

    # get the list of plugins for this collection and load them all
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them all
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, so the constructor call
    # is assembled as a string and evaluated
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
170
# Prepares the build directory.  Unless the builder runs in debug mode or
# was asked to keep old builds, the build directory is removed and created
# afresh together with its "text" subdirectory.
sub init {
    my $self = shift;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # make the text directory
        &util::mk_all_dir("$builddir/text");
    }
}
184
# Creates the compressed text of the collection.
#
# The documents are run through the plugins twice: the first pass is piped
# to "mg_passes -T1", after which mg_compression_dict is run, and the
# second pass is piped to "mg_passes -T2".  In debug mode nothing is piped
# to mg; the document processor writes to STDOUT instead and
# mg_compression_dict is not run.
#
# $textindex is handed to the document processor's set_index.  Dies if one
# of the mg executables is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    # The full paths are only used to check that the executables exist;
    # the commands themselves are started by their bare names.
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug = $self->{'debug'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows wants backslashes in the prefix, everything else gets "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size
    print $outhandle "\n    collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # the output handle is passed around by name, so re-opening the pipe
    # further down redirects the document processor's output as well
    my $handle;
    if ($debug) {
        $handle = 'STDOUT';
    } else {
        my $stats_pass = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra";
        unless (-e "$mg_passes_exe" && open (PIPEOUT, $stats_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($debug) {
        print $outhandle "\n    creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"
            unless -e "$mg_compression_dict_exe";
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size
        my $compress_pass = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra";
        unless (-e "$mg_passes_exe" && open ($handle, $compress_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();
    # compress the text
    print $outhandle "\n    compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
268
# Decides whether an index is to be built.
#
# Returns 0 when the index description matches one of the "dontbuild"
# patterns of the collection configuration (each pattern is anchored at
# both ends); the directory name mapped to the index is then appended to
# the "notbuilt" list.  Returns 1 in every other case.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
284
# Builds the indexes of the collection.
#
# When $indexname contains at least one word character only that index is
# handled; otherwise every index of the collection configuration is.  The
# mapping from index descriptions to directory names is (re)created first
# and stored in the object.  Indexes rejected by want_built are skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
312
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that holds
#   - for every index description, its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the abbreviation
#     chosen for each "level:fields", subcollection and language part,
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder': the
#     keys of those maps in the order in which they were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers which value each abbreviation stands for.  Every
    # category needs a hash of its own; initialising the entries with ''
    # would make all three share one symbolically named global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the key that was just stored in languagemap
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
376
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component (or simply its first two characters when
# no consonant follows) - otherwise it will contain the first
# character of the first two components.
# an undefined field, or one without any word character, gives ""
sub process_field {
    my ($self, $field) = @_;

    return "" if !defined ($field) || $field !~ /\w/;

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/
            unless defined $first && defined $second;
        return "$first$second";
    }

    # only the first two components contribute to the result
    my $abbreviation = "";
    foreach my $component (@components[0, 1]) {
        (my $initial = $component) =~ s/^(.).*$/$1/;
        $abbreviation .= $initial;
    }
    return $abbreviation;
}
399
# Alters one of the abbreviations of an index so that a different
# directory name results.
#
# $namehash maps, per category ('index', 'subcollection', 'languages'),
# each abbreviation to the value it already stands for.  The categories
# are checked in that order and the first abbreviation that stands for a
# value other than the corresponding part of $index is advanced with
# get_next_version (through the reference, so the caller's variable
# changes).  Returns the concatenation of the three abbreviations.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @candidates = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $candidate (@candidates) {
        my ($category, $nameref, $wanted) = @$candidate;
        if ($namehash->{$category}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
414
# Advances the name behind $nameref to its next numbered version, in place:
#   - a name ending in two digits has that number incremented,
#   - a name ending in a single digit has that digit incremented, with 9
#     rolling over to 10 (the name grows by one character),
#   - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        # This branch is only reached when the name does not end in two
        # digits, so the single trailing digit is what has to be replaced
        # (a two-digit pattern could never match here and would leave the
        # name unchanged).
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
        $$nameref =~ s/.$/0/;
    }
}
430
# Builds one index.
#
# $index is an index description ("level:fields[:subcollection...]") whose
# directory name must already be present in the object's index mapping.
# The documents are run through the plugins twice, piped to
# "mg_passes ... -N1" and "mg_passes ... -N2" respectively, with
# mg_perf_hash_build in between; afterwards mg_weights_build, mg_invf_dict
# and mg_stem_idx are run and every file in the index directory whose
# suffix is not listed in %wanted_index_files is deleted.
#
# In debug mode no mg program is run and nothing is deleted; the document
# processor writes to STDOUT instead.
#
# Dies if one of the mg executables is missing, if a pipe cannot be opened
# or if the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    # (the full paths are only used to check that the executables exist;
    # the commands themselves are started by their bare names)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my ($level, $fields, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for the configured languages
    foreach my $configured (@{$self->{'collect_cfg'}->{'languages'}}) {
        # Work on a copy: the loop variable is an alias of the
        # configuration entry, and stripping the leading "!" from the
        # entry itself would turn a negated language into a plain one for
        # every index built after this one.
        my $language = $configured;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n    creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        # the handle is passed around by name, so re-opening it for the
        # second pass redirects the document processor's output as well
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n    inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n    create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n    creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n    creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
596
# Creates the collection's info database and processes associated files.
#
# The document processor is switched to 'infodb' mode and its output is
# piped to txt2db, which is given the database file name inside the "text"
# directory (extension .ldb on little-endian machines, .bdb otherwise).
# Before the documents are read a "[collection]" record with the
# collectionmeta entries of the configuration is written; after them the
# classification information is output.  In debug mode everything goes to
# STDOUT instead of txt2db.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    # the full path is only used for the existence check; txt2db itself
    # is started by its bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgbuilder::PIPEOUT';
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # keys starting with "." name an index, so the mapping is needed
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $key (keys (%$collectionmeta)) {
            my $cmeta = $key;
            if ($cmeta !~ s/^\.//) {
                # ordinary entry, written under its own name
                print $handle "<$cmeta>$collectionmeta->{$cmeta}\n";
                next;
            }

            # ".index" entry, written under the index's directory name
            if (defined $self->{'index_mapping'}->{$cmeta}) {
                print $handle "<$self->{'index_mapping'}->{$cmeta}>" .
                    $collectionmeta->{".$cmeta"} . "\n";
            } else {
                print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
676
# Placeholder for collection specific processing; nothing is done here.
# Returns the invocant.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
680
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the word and section counts parsed
# from the output of mgstat (when it can be run), the index,
# subcollection and language maps of the current index mapping, and the
# list of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # A fresh hash for every call: values collected here must not be
    # carried over from an earlier call in the same process.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    # (the full path is only used for the existence check; mgstat itself
    # is started by its bare name)
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
748
# Counterpart of init; nothing needs to be cleaned up here.
# Returns the invocant.
sub deinit {
    my ($self) = @_;
    return $self;
}
752
# Prints statistics about the pass that has just finished to the output
# handle: the index concerned, the total number of bytes in the collection
# and the number of bytes processed for the index, all as reported by the
# document processor.  A warning block is added when fewer than 50 bytes
# were processed.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # nothing more to report when there was a reasonable amount of text
    return unless $num_processed_bytes < 50;

    my $consequence = $indexing_text
        ? "This may cause an error while attempting to build the index\n"
        : "This may cause an error while attempting to compress the text\n";
    print $outhandle "***************\n";
    print $outhandle "WARNING: There is very little or no text to process for $index\n";
    print $outhandle $consequence;
    print $outhandle "***************\n";
}
781
[4]7821;
783
784
Note: See TracBrowser for help on using the repository browser.