source: trunk/gsdl/perllib/mgbuilder.pm@ 2785

Last change on this file since 2785 was 2785, checked in by sjboddie, 23 years ago

The build process now creates a summary of how many files were included,
which were rejected, etc. A link to a page containing this summary is
provided from the final page of the collector (once the collection is built
successfully) and from the default "about this collection" text for
collections built by the collector.

Also did a little bit of tidying in a couple of places

  • Property svn:keywords set to Author Date Id Revision
File size: 28.1 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Turn autoflush on for STDOUT and STDERR as soon as the module is
# compiled, so that mg doesn't get out of sync with the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush off again when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value handed to mg_passes through its -b option (maximum document size).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index removes files with any other suffix from the index directory.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor.
#
# Reads the collection's collect.cfg, expands the configured index list
# with subcollection and language partitions, removes duplicate index
# descriptions, loads the plugins and classifiers, and creates the
# document processor ("buildproc") that the build passes write through.
#
# Arguments (after the class name):
#   $collection         - collection name
#   $source_dir         - directory the plugins read from
#   $build_dir          - directory the build is written to
#   $verbosity          - verbosity level
#   $maxdocs            - passed on to plugin::begin / plugin::read
#   $debug              - when true, output goes to STDOUT instead of mg
#   $keepold            - when true, init() leaves an existing build alone
#   $allclassifications - passed on to classify::output_classify_info
#   $outhandle          - handle for messages (defaults to STDERR)
#   $no_text            - when true, text is not stored (defaults to 0)
#   $failhandle         - handle given to the plugin loader (defaults to STDERR)
#
# Dies if collect.cfg does not exist, if no plugins could be loaded, or if
# the buildproc object cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language, with the language in the fourth colon-separated field
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %tmphash = ();
    my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@tmparray) {
        if (!defined ($tmphash{$i})) {
            push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
            $tmphash{$i} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name held in
    # $buildproctype. This replaces a string eval that spliced the name
    # into source text and relied on indirect-object syntax. Errors are
    # still caught and re-thrown as strings, as before.
    eval {
        $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                    $build_dir, $verbosity,
                                                    $outhandle);
    };
    die "$@" if $@;

    return $self;
}
180
# Prepares the build directory. Unless the builder is in debug mode or
# was asked to keep the old build, the existing build directory is
# removed and recreated together with its "text" subdirectory.
sub init {
    my $self = $_[0];

    my $fresh_build = !$self->{'debug'} && !$self->{'keepold'};
    if ($fresh_build) {
        my $build_dir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # the compressed text lives in its own subdirectory
        &util::mk_all_dir("$build_dir/text");
    }
}
194
# Creates the compressed text for the collection.
#
# The documents are pushed through the buildproc twice: once into
# "mg_passes -T1" to collect text statistics, and, after
# mg_compression_dict has built the compression dictionary, once into
# "mg_passes -T2" to compress the text. In debug mode no external
# program is run and the buildproc output goes to STDOUT instead.
#
# $textindex is handed to the buildproc's set_index.
# Dies if a required mg executable is missing or a pipe cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows wants backslashes in the prefix; everywhere else mg is
    # given an extra "-d /" argument
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ tr{/}{\\};
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # first pass: collect the statistics for the text
    # (-b $maxdocsize sets the maximum document size)
    print $outhandle "\n collecting text statistics\n" if $verbose;

    my $handle;
    if ($debug) {
        $handle = STDOUT;
    } else {
        my $stats_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 $osextra";
        unless (-e "$mg_passes_exe" && open (PIPEOUT, $stats_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $debug;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$debug) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"
            unless -e "$mg_compression_dict_exe";
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # second pass: reopen the same handle onto a compressing mg_passes.
        # The buildproc keeps writing to the handle it was given above.
        my $compress_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 $osextra";
        unless (-e "$mg_passes_exe" && open ($handle, $compress_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();
}
280
# Decides whether an index should be built.
#
# Each entry of the "dontbuild" list in collect.cfg is used as an anchored
# pattern. If one matches $index, the index's mapped directory name is
# recorded in the object's "notbuilt" list and 0 is returned. Otherwise
# the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        # remember which directory was skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
296
# Builds the indexes of the collection.
#
# When $indexname contains a word character only that index is built;
# otherwise every index listed in collect.cfg is. The mapping from index
# descriptions to directory names is recreated first, and indexes that
# want_built rejects are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
324
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:language]]". Returns a reference to a hash
# that contains
#   - full index description             -> directory name
#   - "level:fields"                     -> processed index name
#   - subcollection / language           -> processed name
#   - 'indexmap', 'subcollectionmap', 'languagemap'
#                                        -> the same maps as nested hashes
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                        -> keys of those maps in first-seen order
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers, per field, which original value each processed
    # name stands for. Each table must be its own hash: they used to start
    # as empty strings, which made the lookups symbolic references that
    # all landed in one shared global hash (and is fatal under strict refs).
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # record the names now in use so later indexes can detect clashes
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
396
# Returns a processed (abbreviated) version of a field.
# If the field has only one component the processed version contains
# the first character and the next consonant of that component (or,
# failing that, its first two characters) - otherwise it contains the
# first character of each of the first two comma-separated components.
# An undefined field, or one without a word character, gives "".
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @parts = split /,/, $field;

    if (scalar @parts < 2) {
        # single component: first character plus the next consonant
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        unless (defined $first && defined $second) {
            # no consonant follows - settle for the first two characters
            ($first, $second) = $field =~ /^(.)(.)/;
        }
        return "$first$second";
    }

    # several components: only the first two contribute
    my $abbrev = "";
    foreach my $part (@parts[0, 1]) {
        (my $initial = $part) =~ s/^(.).*$/$1/;
        $abbrev .= $initial;
    }
    return $abbrev;
}
419
# Resolves a directory-name clash for $index.
#
# $namehash maps, per field ('index', 'subcollection', 'languages'), each
# processed name already in use to the original value it stands for.
# The first of the three processed names (passed by reference) whose
# registered owner differs from this index's own value is advanced with
# get_next_version. The concatenation of the three names is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # field name, reference to the processed name, value it should stand for
    my @fields = (['index',         $indexref, "$level:$gran"],
                  ['subcollection', $subref,   $subcollection],
                  ['languages',     $langref,  $languages]);

    foreach my $entry (@fields) {
        my ($field, $nameref, $wanted) = @$entry;
        next unless $namehash->{$field}->{$$nameref} ne $wanted;

        # this name is taken by something else - move on to its next version
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
434
# Advances the name referenced by $nameref to its next "version", in place:
#   - a name ending in two digits has that number incremented
#   - a name ending in one digit has the digit incremented; 9 becomes 10
#   - any other name has its last character replaced by 0
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        my $num = $1;
        # Only one trailing digit exists in this branch, so the single
        # digit is what must be replaced. Matching two digits here never
        # succeeded, which left names ending in 9 unchanged and made the
        # caller's uniqueness loop spin forever.
        if ($num == 9) {$$nameref =~ s/\d$/10/;}
        else {$num ++; $$nameref =~ s/\d$/$num/;}
    } else {
        $$nameref =~ s/.$/0/;
    }
}
450
# Builds a single index.
#
# $index is an index description "level:fields[:subcollection[:language]]"
# whose directory name must already be present in the object's
# index_mapping. The documents are pushed through the buildproc twice
# (mg_passes -N1 to create the index dictionary, mg_passes -N2 to invert
# the text), with mg_perf_hash_build in between. Afterwards the weights
# file, the on-disk stemmed dictionary and the stem indexes are created,
# and files whose suffix is not in %wanted_index_files are deleted from
# the index directory. In debug mode no external program is run and the
# buildproc output goes to STDOUT.
#
# Dies if a required mg executable is missing, a pipe cannot be opened,
# or the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ tr{/}{\\};
    } else {
        $osextra = " -d /";
    }

    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # expressions selecting the documents that belong in this index
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $sub_spec, $lang_spec) = split (":", $index);

    # one expression per configured subcollection named in the description
    my @sub_names = defined $sub_spec ? split (/,/, $sub_spec) : ();
    foreach my $sub_name (@sub_names) {
        my $expr = $self->{'collect_cfg'}->{'subcollection'}->{$sub_name};
        push (@$indexexparr, $expr) if defined ($expr);
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @lang_names = defined $lang_spec ? split (/,/, $lang_spec) : ();
    foreach my $wanted (@lang_names) {
        # a leading "!" negates the language
        my $prefix = ($wanted =~ s/^\!//) ? "!" : "";
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $known eq $wanted;
            push (@$indexexparr, $prefix . "Language/$wanted/");
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if $verbose;
    my $handle;
    if ($debug) {
        $handle = STDOUT;
    } else {
        my $dict_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra";
        unless (-e "$mg_passes_exe" && open (PIPEOUT, $dict_cmd)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debug;

    $self->print_stats();

    if (!$debug) {
        # create the perfect hash function
        die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n"
            unless -e "$mg_perf_hash_build_exe";
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the same handle onto the inverting pass; the buildproc
        # keeps writing to the handle it was given above
        my $invert_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -c 3 -G -t 10 -N2 $osextra";
        unless (-e "$mg_passes_exe" && open ($handle, $invert_cmd)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if $verbose;

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debug) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n"
            unless -e "$mg_weights_build_exe";
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n"
            unless -e "$mg_invf_dict_exe";
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n"
            unless -e "$mg_stem_idx_exe";
        foreach my $method (1, 2, 3) {
            system ("mg_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
623
# Creates the collection's info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in
# debug mode), in this order: a "[collection]" record built from the
# collectionmeta entries of collect.cfg, the records produced by running
# the plugins with the buildproc in 'infodb' mode, the classification
# information, and a "[browselist]" record listing all documents.
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debug = $self->{'debug'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name - the extension depends on the machine's byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ tr{/}{\\} if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($debug) {
        $handle = STDOUT;
    } else {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        # names starting with "." are looked up in the index mapping
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            # tag under which this piece of metadata is written
            my $tag;
            my $name = $key;
            if ($name =~ s/^\.//) {
                if (!defined $self->{'index_mapping'}->{$name}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$name' - ignored\n";
                    next; #ignore this one
                }
                $tag = $self->{'index_mapping'}->{$name};
            } else {
                $tag = $name; # just using the same name
            }

            my $values = $collectionmeta->{$key};
            my $entry = "";
            my $fallback = "";
            my $seen_any = 0;
            my $seen_default = 0;

            #iterate through the languages
            foreach my $lang (keys (%$values)) {
                if (!$seen_any) {
                    $seen_any = 1;
                    #set the default default to the first entry
                    $fallback = $values->{$lang};
                }
                if ($lang =~ /default/) {
                    $seen_default = 1;
                    #the default entry goes first
                    $entry = "<$tag>" . $values->{'default'} . "\n" . $entry;
                    next;
                }
                # language specific entries are keyed as [l=xx]
                my ($code) = $lang =~ /^\[l=(\w*)\]$/;
                if ($code) {
                    $entry .= "<" . $tag . ":" . $code . ">" . $values->{$lang} . "\n";
                }
            }

            #if we haven't found a default, put one in
            if (!$seen_default) {
                $entry = "<$tag>$fallback\n" . $entry;
            }

            #write the entry to the file
            print $handle $entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$debug;
}
755
# Does nothing in this class beyond reading its invocant.
sub collect_specific {
    my $self = $_[0];
}
759
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts parsed from the
# output of mgstat (a warning is printed if mgstat cannot be run), the
# index / subcollection / language name mappings created by
# create_index_mapping, and the list of indexes that were not built.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a lexical hash reference. The
    # previous code declared an unused %build_cfg and then filled the
    # package global $build_cfg, so entries from an earlier call (for
    # example 'notbuilt' or 'numwords') could leak into a later build.cfg.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information; the two patterns name the keys
    # handed to write_cfg_file as its third and fourth arguments
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
826
# Does nothing in this class beyond reading its invocant.
sub deinit {
    my $self = $_[0];
}
830
# Prints statistics about the pass the buildproc has just completed to
# the output handle: which index was processed, the total number of bytes
# in the collection and the number of bytes processed for the index.
# A warning block is added when fewer than 50 bytes were processed, unless
# the pass was a text compression pass with text storage switched off.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $headline = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $headline;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # inside this block a non-indexing pass always has text storage on
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
859
8601;
Note: See TracBrowser for help on using the repository browser.