source: tags/gsdl-2_30d-distribution/gsdl/perllib/mgbuilder.pm@ 2308

Last change on this file since 2308 was 1973, checked in by kjm18, 23 years ago

fixed up language stuff

  • Property svn:keywords set to Author Date Id Revision
File size: 27.4 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch STDOUT and STDERR to autoflush while this module is in use so that
# output from mg does not get out of sync with output from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Maximum document size, passed to mg_passes through its -b option.
$maxdocsize = 12000;

# Suffixes of the index files that are kept after an index has been built;
# build_index deletes files in the index directory with any other suffix.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructs an mgbuilder object for one collection.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag and an optional output handle (STDERR is used
# when none is given).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and by language, drops duplicate index descriptions, loads
# the configured plugins and classifiers, and creates the document
# processor.  Dies when collect.cfg does not exist, when no plugins were
# loaded, or when the document processor cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'notbuilt'           => [],    # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n"
        unless -e $colcfgname;
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # one copy of every index per subcollection: "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # one copy of every index per language; when there are no subcollections
    # an empty subcollection field is inserted, giving "index::language"
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $base_indexes = $self->{'collect_cfg'}->{'indexes'};
        my $separator =
            defined ($self->{'collect_cfg'}->{'indexsubcollections'}) ? ":" : "::";
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$base_indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}},
                      $index . $separator . $language);
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %already_listed = ();
    my @configured = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $candidate (@configured) {
        next if defined ($already_listed{$candidate});
        $already_listed{$candidate} = 1;
        push (@{$self->{'collect_cfg'}->{'indexes'}}, $candidate);
    }

    # get the list of plugins for this collection and load them
    my $plugins = defined $self->{'collect_cfg'}->{'plugin'}
        ? $self->{'collect_cfg'}->{'plugin'}
        : [];
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection and load them
    my $classifiers = defined $self->{'collect_cfg'}->{'classify'}
        ? $self->{'collect_cfg'}->{'classify'}
        : [];
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields as a lookup table
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building: a collection specific
    # "<collection>buildproc" is used when the collection provides one,
    # otherwise the standard mgbuildproc
    my $buildprocdir  = "$ENV{'GSDLHOME'}/perllib";
    my $buildproctype = "mgbuildproc";
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir  = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # the class name is only known at run time, hence the string eval
    eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " .
         "\$source_dir, \$build_dir, \$verbosity, \$outhandle)");
    die "$@" if $@;

    return $self;
}
175
# Prepares the build directory.  Unless the builder is in debug or keepold
# mode, any old build is removed and the build directory and its "text"
# subdirectory are created afresh.
sub init {
    my ($self) = @_;

    # debug and keepold runs leave the existing build untouched
    return if ($self->{'debug'} || $self->{'keepold'});

    my $build_dir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}
189
# Builds the compressed text for the collection from the index $textindex.
#
# The documents are run through the plugins twice: the first pass feeds
# mg_passes -T1 to collect text statistics, then mg_compression_dict builds
# the compression dictionary, and the second pass feeds mg_passes -T2 to
# compress the text.  In debug mode nothing is executed and the document
# text is written to STDOUT instead.
#
# Each executable is checked for existence under $GSDLHOME/bin/$GSDLOS but
# is invoked by its bare name.  Dies if an executable is missing or a pipe
# cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $fulltextprefix =
        &util::filename_cat ($self->{'build_dir'}, "text/$self->{'collection'}");

    # windows wants backslashes in the prefix; everywhere else "-d /" is
    # added to the command lines
    my $osextra = " -d /";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $osextra = "";
        $fulltextprefix =~ s/\//\\/g;
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if $verbose;

    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        my $stats_pass = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra";
        if (!-e "$mg_passes_exe" || !open (PIPEOUT, $stats_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # first pass over the documents
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"
            unless -e "$mg_compression_dict_exe";
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        my $compress_pass = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra";
        if (!-e "$mg_passes_exe" || !open ($handle, $compress_pass)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    # second pass over the documents: compress the text
    $self->{'buildproc'}->reset();
    print $outhandle "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
273
# Returns 1 if the index $index should be built and 0 if it matches one of
# the "dontbuild" entries of collect.cfg.  Each entry is used as a regular
# expression that has to match the whole index description.  The directory
# name of a rejected index is appended to the builder's 'notbuilt' list.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
289
# Builds either the single index $indexname (when it is given and contains
# a word character) or every index listed in collect.cfg.  The mapping from
# index descriptions to directory names is (re)created first and stored in
# $self->{'index_mapping'}; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
317
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".  Returns a reference to a hash
# that holds
#   - every full index description mapped to its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map the
#     "level:fields", subcollection and language parts to their processed
#     short names (the same entries are also stored at the top level), and
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder', which
#     list the keys of those maps in the order they were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # For each name component, the processed names handed out so far and the
    # field each one stands for.  These have to be real hash references:
    # initialising them with '' made every lookup a symbolic reference to
    # one shared package-global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique; make_unique rewrites
        # one of the three components through the references passed to it
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been handed out for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
389
# Returns a processed (shortened) version of a field.
# If the field has only one component the processed version contains the
# first character and the next consonant of that component (or simply the
# first two characters when no such consonant exists) - otherwise it
# contains the first character of each of the first two comma separated
# components.  An undefined field, or one without any word character,
# gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components contribute
        my @initials = @components[0, 1];
        foreach my $part (@initials) {
            $part =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    unless (defined $first && defined $second) {
        # no consonant after the first character: take the first two characters
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
412
# Tries to resolve a directory name collision for the index description
# $index.  $namehash holds, for the 'index', 'subcollection' and 'languages'
# components, the processed names already in use and the field each one was
# created for.  The first component (in that order) whose processed name is
# not registered for this index's own field is advanced to its next version
# through the reference passed in.  Returns the directory name built from
# the three, possibly updated, components.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # component kind, reference to its processed name, field it must stand for
    my @components = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $component (@components) {
        my ($kind, $nameref, $field) = @$component;
        if ($namehash->{$kind}->{$$nameref} ne $field) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return "$$indexref$$subref$$langref";
}
427
# Advances the name referenced by $nameref to its next "version", in place:
#   - a name ending in two digits has those two digits incremented,
#   - a name ending in one digit has that digit incremented (9 becomes 10),
#   - any other name has its last character replaced by 0,
#   - an empty name becomes "0".
# Callers loop until the resulting directory name is unused, so every call
# has to change the name.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        # Only a single trailing digit exists in this branch, so it is that
        # one digit that has to be replaced; a two-digit pattern can never
        # match here and used to leave the name stuck at 9.
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } elsif ($$nameref eq "") {
        # there is no last character to replace
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
}
443
# Builds the index described by $index ("level:fields[:subcollection
# [:languages]]") in the directory $self->{'index_mapping'} assigns to it.
#
# The documents are run through the plugins twice: once into mg_passes -N1
# to create the index dictionary and, after mg_perf_hash_build, once into
# mg_passes -N2 to invert the text.  Then the weights file, the on-disk
# stemmed dictionary and the stem indexes are created and every file in the
# index directory whose suffix is not in %wanted_index_files is deleted.
# In debug mode no executable is run and the text goes to STDOUT.
#
# Each executable is checked for existence under $GSDLHOME/bin/$GSDLOS but
# is invoked by its bare name.  Dies if an executable is missing, a pipe
# cannot be opened or the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $verbose   = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe          = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe   = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe       = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe        = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = " -d /";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $osextra = "";
        $fullindexprefix =~ s/\//\\/g;
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # expressions selecting the documents that belong in this index
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    # add the expression of every subcollection that collect.cfg defines
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subname};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  A leading "!" negates the expression.
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $langname (@languages) {
        my $negated = ($langname =~ s/^\!//) ? 1 : 0;
        foreach my $configured (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $configured eq $langname;
            push (@$indexexparr,
                  $negated ? "!Language/$langname/" : "Language/$langname/");
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if $verbose;
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        my $dictionary_pass = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra";
        if (!-e "$mg_passes_exe" || !open (PIPEOUT, $dictionary_pass)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    # first pass over the documents
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    unless ($self->{'debug'}) {
        # create the perfect hash function
        die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n"
            unless -e "$mg_perf_hash_build_exe";
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        my $inversion_pass = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -c 3 -G -t 10 -N2 $osextra";
        if (!-e "$mg_passes_exe" || !open ($handle, $inversion_pass)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text (second pass over the documents)
    print $outhandle "\n inverting the text\n" if $verbose;

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n"
            unless -e "$mg_weights_build_exe";
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n"
            unless -e "$mg_invf_dict_exe";
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n"
            unless -e "$mg_stem_idx_exe";
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files: anything with a suffix that is not wanted
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
624
# Creates the collection's info database and processes associated files.
#
# The database text is piped into txt2db, which writes
# <build_dir>/text/<collection>.ldb or .bdb depending on
# &util::is_little_endian(); in debug mode the text goes to STDOUT instead.
# The output consists of a "[collection]" record made from the
# collectionmeta entries of collect.cfg (when there are any), whatever the
# document processor writes in 'infodb' mode while the documents are read,
# and finally the classification information.  Dies if txt2db is missing or
# the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    # txt2db is checked for under the exe directory but run by bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # choose where the database text goes
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the index mapping is needed to translate ".name" options
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $option (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $value = $self->{'collect_cfg'}->{'collectionmeta'}->{$option};

            # options starting with "." are written under the short name the
            # index mapping holds for the rest of the option name
            my $name = $option;
            if ($name =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$name}) {
                    print $handle "<$self->{'index_mapping'}->{$name}>" .
                        $value . "\n";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
                }
            } else {
                print $handle "<$name>$value\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";

    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
705
# Empty in this class: the builder object is picked up and nothing else
# is done.
sub collect_specific {
    my $self = $_[0];
}
709
# Writes <build_dir>/build.cfg, which records the build date, the number of
# documents and bytes reported by the document processor, the word and
# section counts reported by mgstat (when it can be run), the index,
# subcollection and language maps, and the list of indexes that were not
# built.  A warning is printed, and the mgstat figures are left out, when
# mgstat is missing or cannot be run.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a fresh hash on every call.  It used
    # to be written to an undeclared package variable, so optional entries
    # such as 'notbuilt' leaked from one call into the next.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg; mgstat is checked for under the exe
    # directory but run by its bare name
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only written when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
777
# Empty in this class: the builder object is picked up and nothing else
# is done.
sub deinit {
    my $self = $_[0];
}
781
# Prints statistics about the pass the document processor has just made to
# the builder's output handle: which index was being processed, the total
# number of bytes in the collection and the number of bytes processed for
# the index.  A warning block is added when fewer than 50 bytes were
# processed.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text       = $buildproc->get_indexing_text();
    my $index               = $buildproc->get_index();
    my $num_bytes           = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # plenty of text: nothing to warn about
    return unless ($num_processed_bytes < 50);

    my $consequence = $indexing_text
        ? "This may cause an error while attempting to build the index\n"
        : "This may cause an error while attempting to compress the text\n";
    print $outhandle "***************\n";
    print $outhandle "WARNING: There is very little or no text to process for $index\n";
    print $outhandle $consequence;
    print $outhandle "***************\n";
}
810
8111;
812
813
Note: See TracBrowser for help on using the repository browser.