source: trunk/gsdl/perllib/mgbuilder.pm@ 3115

Last change on this file since 3115 was 3115, checked in by jrm21, 22 years ago

Redirect mg(pp)_passes stderr to /dev/null if the "-out xxx" option is given
to buildcol.pl, as some things (eg cron) think a program fails if there is
any output to stderr.

  • Property svn:keywords set to Author Date Id Revision
File size: 28.2 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
BEGIN {
    # Turn autoflush on for both standard streams so that output written
    # by mg does not get out of sync with output written by the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Switch autoflush back off again when the program finishes.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Value handed to mg_passes with its -b option (maximum document size).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index deletes files carrying any other suffix.
%wanted_index_files = map { $_ => 1 }
    ('td', 't', 'idb', 'ib1', 'ib2', 'ib3', 'i', 'ip', 'tiw', 'wa');
59
60
# Constructor: creates an mgbuilder object for one collection.
#
# Arguments (positional):
#   $class              - class name to bless into
#   $collection         - collection name (also selects a collection
#                         specific "<collection>buildproc.pm" if one exists)
#   $source_dir         - directory handed to the plugins as the source
#   $build_dir          - directory the build is written to
#   $verbosity          - verbosity level
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug              - when true, output goes to STDOUT instead of mg
#   $keepold            - when true, init() keeps any existing build
#   $allclassifications - passed to classify::output_classify_info
#   $outhandle          - handle for progress messages (default "STDERR")
#   $no_text            - when true, text is not stored (default 0)
#   $failhandle         - handle passed to the plugin loader (default "STDERR")
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, expands the configured
# indexes by subcollection and language, loads plugins and classifiers and
# creates the build document processor.  Dies if collect.cfg is missing,
# if no plugins could be loaded, or if the document processor cannot be
# constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    # The defaults are the handle *name*; build_index compares
    # $outhandle against the string "STDERR".
    $outhandle = "STDERR" unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = "STDERR" unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = ($self->{'collect_cfg'} ||= {});

    # sort out subcollection indexes: every index is combined with
    # every subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'} || [];
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@expanded, map { "$_:$subcollection" } @$indexes);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: when there is no subcollection part
    # an empty subcollection field is inserted ("index::language")
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'} || [];
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@expanded, map { $_ . $separator . $language } @$indexes);
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is kept); a missing 'indexes'
    # entry is treated as an empty list
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'} || []} ];

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # A class name held in a string can be used directly as the invocant,
    # so no string eval / indirect object syntax is needed here.
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}
180
# Prepares the build directory.  Unless the builder is in debug mode or
# was asked to keep the old build, the build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # the text directory lives directly below the build directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
194
# Creates the compressed text of the collection.
#
# Arguments:
#   $textindex - index description handed to the document processor
#                (set_index) while the text is read
#
# The documents are read twice through the plugins: the first pass feeds
# "mg_passes -T1" to collect text statistics, then mg_compression_dict
# builds the compression dictionary, and the second pass feeds
# "mg_passes -T2" to compress the text.  In debug mode nothing is run and
# the document processor writes to STDOUT instead.  Dies if a required
# executable is missing or a pipe cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            # (kept in step with build_index)
            $osextra .= " 2>/dev/null";
        }
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        # The handle is passed around by name; the second pass below
        # re-opens the same name, so the document processor keeps
        # writing to the right pipe without being told again.
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $buildproc->set_store_text(0);
    } else {
        $buildproc->set_store_text(1);
    }
    $buildproc->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();
    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
280
# Decides whether an index should be built.
#
# Arguments:
#   $index - index description to check
#
# Returns 0 if $index matches (anchored at both ends) one of the patterns
# listed under 'dontbuild' in the collection configuration; in that case
# the mapped directory name of the index is also appended to the
# 'notbuilt' list.  Returns 1 otherwise.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that was skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
296
# Builds either one named index or every index configured for the
# collection.
#
# Arguments:
#   $indexname - optional index description; when it contains at least
#                one word character only that index is considered,
#                otherwise the 'indexes' list of the collection
#                configuration is used
#
# The mapping from index descriptions to directory names is recreated
# and stored in $self->{'index_mapping'} before anything is built.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [ $indexname ]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes, skipping the ones want_built rejects
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
324
# creates directory names for each of the index descriptions
#
# Arguments:
#   $indexes - reference to a list of index descriptions of the form
#              "level:fields[:subcollection[:language]]"
#
# Returns a reference to a hash that maps each index description to a
# directory name that is unique within the build directory.  The hash
# also carries 'indexmap', 'subcollectionmap' and 'languagemap' sub-hashes
# with their '...maporder' lists, and direct entries for each
# "level:fields", subcollection and language value.
#
# Dies if a directory name collision cannot be resolved.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # One table per name component, recording which description owns
    # each processed name.  These must be real (separate) hashes: a ''
    # placeholder would be dereferenced as a symbolic reference, making
    # all three tables share a single package-global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
            if ($candidate eq $dirname) {
                # no component was changed, so looping again cannot help
                die "mgbuilder::create_index_mapping - couldn't create a " .
                    "unique directory name for index $index\n";
            }
            $dirname = $candidate;
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
396
# Produces the short form of one field of an index description.
#
# A field made up of several comma separated components is reduced to
# the first character of each of its first two components.  A field with
# a single component is reduced to its first character followed by the
# next consonant; if there is no such consonant its first two characters
# are used.  An undefined field, or one without any word character,
# gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @parts = split /,/, $field;

    if (scalar @parts < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        # no consonant found - fall back to the first two characters
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }

    # several components: keep the first character of the first two
    my @initials = @parts[0, 1];
    foreach my $initial (@initials) {
        $initial =~ s/^(.).*$/$1/;
    }
    return join("", @initials);
}
419
# Alters one component of a directory name that collided with an
# existing one and returns the new combined name.
#
# Arguments:
#   $namehash - reference to a hash with 'index', 'subcollection' and
#               'languages' tables mapping processed names to the
#               description that owns them
#   $index    - the full index description being named
#   $indexref, $subref, $langref
#             - references to the processed index, subcollection and
#               language names; the first one whose table entry belongs
#               to a different description is advanced in place with
#               get_next_version
#
# Returns the concatenation of the three (possibly updated) names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # components in the order they are tried: table, name, expected owner
    my @components = (
        [ 'index',         $indexref, "$level:$gran" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages ],
    );

    foreach my $component (@components) {
        my ($table, $nameref, $owner) = @$component;
        if ($namehash->{$table}->{$$nameref} ne $owner) {
            # only the first clashing component is changed
            $self->get_next_version ($nameref);
            last;
        }
    }

    return $$indexref . $$subref . $$langref;
}
434
# Advances a name to its next version, in place.
#
# Arguments:
#   $nameref - reference to the name to change
#
# A name ending in two digits has that number incremented, a name ending
# in one digit has the digit incremented (9 becomes 10, lengthening the
# name by one character), and any other name has its last character
# replaced by 0.  An empty name becomes "0".  Every call changes the
# name, so callers looping until a name is unique always make progress.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        if ($num == 9) {
            # only a single trailing digit exists here (two digits were
            # handled above), so replace just that digit
            $$nameref =~ s/\d$/10/;
        } else {
            $num ++;
            $$nameref =~ s/\d$/$num/;
        }
    } elsif ($$nameref =~ /.$/) {
        $$nameref =~ s/.$/0/;
    } else {
        # nothing to replace (empty name)
        $$nameref .= "0";
    }
}
450
# Builds one index of the collection.
#
# Arguments:
#   $index - index description "level:fields[:subcollection[:language]]";
#            its directory name is looked up in $self->{'index_mapping'}
#
# The documents are read twice through the plugins, feeding
# "mg_passes -N1" (index dictionary) and then "mg_passes -N2" (inversion),
# with mg_perf_hash_build run in between.  Afterwards mg_weights_build,
# mg_invf_dict and mg_stem_idx are run and every file in the index
# directory whose suffix is not listed in %wanted_index_files is deleted.
# In debug mode no program is run and the document processor writes to
# STDOUT.  Dies if a required executable is missing, a pipe cannot be
# opened or the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # both prefixes end up on a command line, so both get the
        # same treatment (as compress_text does for the text prefix)
        $fullindexprefix =~ s@/@\\@g;
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    # "|| []" keeps the loop from creating a 'languages' entry in the
    # configuration when none was configured
    my $known_languages = $self->{'collect_cfg'}->{'languages'} || [];
    foreach my $langname (@languages) {
        # a leading "!" asks for everything except that language
        my $not = 0;
        if ($langname =~ s/^\!//) {
            $not = 1;
        }
        foreach my $known (@$known_languages) {
            if ($known eq $langname) {
                if ($not) {
                    push (@$indexexparr, "!Language/$langname/");
                } else {
                    push (@$indexexparr, "Language/$langname/");
                }
                last;
            }
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        # The handle is passed around by name; the second pass below
        # re-opens the same name.
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        foreach my $stem_method (1, 2, 3) {
            system ("mg_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
}
627
# Creates the collection information database and processes the
# associated files.
#
# The records are written as text to a pipe into txt2db (or to STDOUT in
# debug mode): first a [collection] record built from the 'collectionmeta'
# configuration, then whatever the document processor emits in 'infodb'
# mode while the plugins read the documents, then the classification
# information and finally a [browselist] record listing all documents.
# Dies if txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $separator = "\n" . ('-' x 70) . "\n";   # ends a database record

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name - the extension depends on the byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        # the index mapping is needed to translate ".index" style names
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $metaname (keys (%$collectionmeta)) {
            # name written to the database; names starting with "." are
            # replaced by their entry in the index mapping
            my $tagname = $metaname;
            if ($metaname =~ /^\./) {
                my $stripped = substr ($metaname, 1);
                if (!defined $self->{'index_mapping'}->{$stripped}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$stripped' - ignored\n";
                    next; #ignore this one
                }
                $tagname = $self->{'index_mapping'}->{$stripped};
            }

            # default lines are collected at the front, language
            # specific lines behind them
            my $default_lines = "";
            my $language_lines = "";
            my $fallback = "";      # value of the first entry seen
            my $seen_entry = 0;
            my $seen_default = 0;

            #iterate through the languages
            foreach my $lang (keys (%{$collectionmeta->{$metaname}})) {
                my $value = $collectionmeta->{$metaname}->{$lang};
                unless ($seen_entry) {
                    $seen_entry = 1;
                    #set the default default to the first entry
                    $fallback = $value;
                }
                if ($lang =~ /default/) {
                    $seen_default = 1;
                    #the default entry goes first
                    $default_lines = "<$tagname>" .
                        $collectionmeta->{$metaname}->{'default'} . "\n" . $default_lines;
                    next;
                }
                my ($code) = $lang =~ /^\[l=(\w*)\]$/;
                $language_lines .= "<$tagname:$code>" . $value . "\n" if ($code);
            }

            #if we haven't found a default, put one in
            $default_lines = "<$tagname>$fallback\n" unless $seen_default;

            #write the entry to the file
            print $handle $default_lines . $language_lines;
        }

        print $handle $separator;
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my @browselist = (
        "[browselist]\n",
        "<hastxt>0\n",
        "<childtype>VList\n",
        "<numleafdocs>" . scalar (@doclist) . "\n",
        "<thistype>Invisible\n",
        "<contains>" . join (";", @doclist),
        $separator,
    );
    foreach my $line (@browselist) {
        print $handle $line;
    }

    close ($handle) if !$self->{'debug'};
}
759
# Empty hook: takes its invocant and does nothing else.
sub collect_specific {
    my $self = shift;
}
763
# Writes the build.cfg file into the build directory.
#
# The file records the build date, the document and byte counts reported
# by the document processor, the word and section counts parsed from the
# output of mgstat (when it can be run), the index, subcollection and
# language maps from $self->{'index_mapping'}, and the list of indexes
# that were not built.  A warning is printed, and the mgstat figures are
# left out, if mgstat is missing or cannot be started.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # A fresh structure for every call, so that optional entries from an
    # earlier call can never show up in this build.cfg.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    # NOTE(review): the two patterns presumably name the single-valued and
    # the list-valued entries respectively - confirm against cfgread.
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
830
# Empty hook: takes its invocant and does nothing else.
sub deinit {
    my $self = shift;
}
834
# Prints the statistics of the pass that has just finished to the
# builder's output handle: which index was processed and the byte counts
# reported by the document processor.  When fewer than 50 bytes were
# processed - and the pass was either indexing text or text is being
# stored - a warning block is printed as well.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # the condition above guarantees that one of the two
        # warnings applies
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
863
8641;
Note: See TracBrowser for help on using the repository browser.