source: trunk/gsdl/perllib/mgbuilder.pm@ 2772

Last change on this file since 2772 was 2772, checked in by kjm18, 23 years ago

changes to enable language specific collectionmeta in collect.cfg
collectionmeta now specified as eg
collectionmeta collectionname [l=en] "greenstone demo"
any entries without language parameter are used as a default

  • Property svn:keywords set to Author Date Id Revision
File size: 28.0 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Module start-up and shut-down hooks plus the package-level settings
# shared by the build methods below.

BEGIN {
    # switch buffering off on both standard streams so that output from
    # mg doesn't get out of sync with output from the plugins
    $_->autoflush(1) foreach (\*STDOUT, \*STDERR);
}

END {
    # hand the standard streams back in their buffered state
    $_->autoflush(0) foreach (\*STDOUT, \*STDERR);
}

# value handed to mg_passes through its -b option
$maxdocsize = 12000;

# suffixes of the index files that are kept once an index has been built;
# build_index deletes files in the index directory with any other suffix
%wanted_index_files = map { ($_ => 1) }
    ('td', 't', 'idb', 'ib1', 'ib2', 'ib3', 'i', 'ip', 'tiw', 'wa');
59
60
# Constructor.
#
# Arguments (after the class name): collection name, source directory,
# build directory, verbosity, maxdocs, debug flag, keepold flag,
# allclassifications flag, output handle (STDERR when not supplied) and
# the no_text flag (0 when not supplied).
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes
# by subcollection and by language, removes duplicate index entries,
# loads the plugins and classifiers named in the configuration and
# creates the document processor ("buildproc").
#
# Dies when collect.cfg does not exist, when no plugin could be loaded,
# or when the buildproc module cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'notbuilt'=>[]    # indexes not built
                     }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is built once per
    # configured subcollection
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is built once per
    # configured language
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %tmphash = ();
    my @tmparray = @{$self->{'collect_cfg'}->{'indexes'}};
    $self->{'collect_cfg'}->{'indexes'} = [];
    foreach my $i (@tmparray) {
        if (!defined ($tmphash{$i})) {
            push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
            $tmphash{$i} = 1;
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # The class name is only known at run time. Calling the constructor
    # through the class-name string avoids evaluating generated source
    # code and the indirect-object ("new Class (...)") parsing rules.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
178
# Prepares the build directory for a fresh build. Nothing is touched
# when either the debug or the keepold flag is set.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # throw away whatever an earlier build left behind
        &util::rm_r ($build_dir);
        &util::mk_all_dir ($build_dir);

        # the compressed text goes into its own subdirectory
        &util::mk_all_dir ("$build_dir/text");
    }
}
192
# Creates the compressed text for the collection.
#
# $textindex is handed to the document processor as the index to use.
# The documents are read three times through the plugins: once through
# "mg_passes -T1" to collect text statistics, then (after
# mg_compression_dict has been run) once through "mg_passes -T2" to
# compress the text. In debug mode no program is started and the
# document processor writes to STDOUT instead.
#
# Dies when mg_passes or mg_compression_dict is not found in
# $GSDLHOME/bin/$GSDLOS or when a pipe to mg_passes cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $debugging = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # full paths of the programs; only used to check they are installed
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $passes_path = &util::filename_cat ($exedir, "mg_passes$exe");
    my $dict_path = &util::filename_cat ($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat ($self->{'build_dir'}, "text"));
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'},
                                              "text/$self->{'collection'}");

    # windows wants backslashes in the prefix, everything else gets "-d /"
    my $on_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);
    my $osextra = $on_windows ? "" : " -d /";
    $fulltextprefix =~ tr{/}{\\} if $on_windows;

    # the part of the mg_passes command line both passes have in common
    my $passes_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize";

    print $outhandle "\n*** creating the compressed text\n" if $verbose;

    # first pass: collect the statistics for the text
    print $outhandle "\n collecting text statistics\n" if $verbose;

    my $handle;
    if ($debugging) {
        $handle = STDOUT;
    } else {
        die "mgbuilder::compress_text - couldn't run $passes_path\n"
            unless (-e "$passes_path" && open (PIPEOUT, "$passes_cmd -T1 $osextra"));
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    # the text itself is only stored when no_text is off
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $buildproc, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $debugging;

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    unless ($debugging) {
        print $outhandle "\n creating the compression dictionary\n" if $verbose;
        die "mgbuilder::compress_text - couldn't run $dict_path\n"
            unless -e "$dict_path";
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # reopen the pipe for the second pass
        die "mgbuilder::compress_text - couldn't run $passes_path\n"
            unless (-e "$passes_path" && open ($handle, "$passes_cmd -T2 $osextra"));
    }

    # second pass: compress the text
    $buildproc->reset();
    print $outhandle "\n compressing the text\n" if $verbose;
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debugging;

    $self->print_stats();
}
278
# Decides whether $index should be built.
#
# Each entry of the "dontbuild" configuration list is used as a pattern
# that has to match the whole index description. On the first match the
# directory name mapped to the index is recorded in the "notbuilt" list
# and 0 is returned; when nothing matches the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
294
# Builds either the single index named by $indexname or, when no usable
# name is given, every index listed in the collection configuration.
# The index-to-directory mapping is (re)created for the chosen indexes
# and stored in the object before building starts.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    # an explicit index name wins over the configured list
    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    foreach my $index (@$indexes) {
        unless ($self->want_built ($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index ($index);
    }
}
322
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash that maps
#   - every full index description to its directory name,
#   - "level:fields", subcollection and language strings to their
#     short forms,
#   - 'indexmap', 'subcollectionmap' and 'languagemap' to hashes of
#     those short forms, and
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder'
#     to lists recording the order in which the entries were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # Short name -> full name, kept separately for each component.
    # Each slot must be its own hash: with an empty string here the
    # three slots would all be dereferenced symbolically and end up
    # sharing one package-global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been handed out so far
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
394
# Returns the short form of a field.
#
# A field made of several comma separated components is reduced to the
# first character of each of its first two components. A field with a
# single component is reduced to its first character followed by the
# next consonant; when there is no such consonant the first two
# characters are used. An undefined field, or one without any word
# character, gives the empty string.
sub process_field {
    my ($self, $field) = @_;

    return "" if (!defined ($field) || $field !~ /\w/);

    my @parts = split /,/, $field;
    if (@parts > 1) {
        # only the first two components contribute
        my $short = "";
        foreach my $part (@parts[0, 1]) {
            my $initial = $part;
            $initial =~ s/^(.).*$/$1/;
            $short .= $initial;
        }
        return $short;
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    unless (defined $first && defined $second) {
        # no consonant after the first character
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
417
# Changes one component of a directory name that is already in use.
#
# $namehash holds, per component ('index', 'subcollection',
# 'languages'), the full name each short name was handed out for.
# $indexref, $subref and $langref refer to the short names making up
# the directory name of $index. The first component whose short name
# was handed out for a different full name is moved on to its next
# version; the (possibly changed) directory name is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # component, reference to its short name, full name it should stand for
    my @components = (
        ['index',         $indexref, "$level:$gran"],
        ['subcollection', $subref,   $subcollection],
        ['languages',     $langref,  $languages],
    );

    foreach my $component (@components) {
        my ($kind, $shortref, $fullname) = @$component;
        next unless $namehash->{$kind}->{$$shortref} ne $fullname;

        $self->get_next_version ($shortref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
432
# Moves the short name referred to by $nameref on to its next version,
# modifying it in place.
#
# A name without a numeric suffix has its last character replaced by
# "0"; a name ending in one or two digits has that number incremented.
# A single trailing "9" rolls over to "10", so the name always changes.
# Callers loop until the name is unused and would never terminate on a
# name that stayed the same.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit: 0-8 move up by one, 9 becomes 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
448
# Builds one index.
#
# $index is an index description of the form
# "level:fields[:subcollection[:languages]]"; its directory name is
# taken from the index mapping stored in the object. The documents are
# read twice through the plugins: once through "mg_passes -N1" to build
# the index dictionary and, after mg_perf_hash_build has been run, once
# through "mg_passes -N2" to invert the text. Afterwards
# mg_weights_build, mg_invf_dict and mg_stem_idx are run and every file
# in the index directory whose suffix is not listed in
# %wanted_index_files is removed. In debug mode no program is started
# and the document processor writes to STDOUT instead.
#
# Dies when one of the programs is not found in $GSDLHOME/bin/$GSDLOS,
# when a pipe to mg_passes cannot be opened or when the index directory
# cannot be read.
sub build_index {
    my ($self, $index) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $build_dir = $self->{'build_dir'};
    my $collection = $self->{'collection'};
    my $debugging = $self->{'debug'};
    my $verbose = ($self->{'verbosity'} >= 1);

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat ($build_dir, $indexdir));
    my $fullindexprefix = &util::filename_cat ($build_dir, $indexdir, $collection);
    my $fulltextprefix = &util::filename_cat ($build_dir, "text", $collection);

    # full paths of the programs; only used to check they are installed
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my %exe_path = ();
    foreach my $program ("mg_passes", "mg_perf_hash_build", "mg_weights_build",
                         "mg_invf_dict", "mg_stem_idx") {
        $exe_path{$program} = &util::filename_cat ($exedir, "$program$exe");
    }

    # windows wants backslashes in the prefix, everything else gets "-d /"
    my $on_windows = ($ENV{'GSDLOS'} =~ /^windows$/i);
    my $osextra = $on_windows ? "" : " -d /";
    $fullindexprefix =~ tr{/}{\\} if $on_windows;

    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # expressions restricting the index to its subcollections and languages
    my $indexexparr = [];
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    my @subcollections = defined ($subcollection) ? split (/,/, $subcollection) : ();
    foreach my $subname (@subcollections) {
        my $expression = $self->{'collect_cfg'}->{'subcollection'}->{$subname};
        push (@$indexexparr, $expression) if defined ($expression);
    }

    # only languages that are listed in the configuration get an
    # expression; a leading "!" asks for everything but that language
    my @languages = defined ($language) ? split (/,/, $language) : ();
    foreach my $wanted (@languages) {
        my $negation = ($wanted =~ s/^\!//) ? "!" : "";
        foreach my $known (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless $known eq $wanted;
            push (@$indexexparr, "${negation}Language/$wanted/");
            last;
        }
    }

    # the part of the mg_passes command line both passes have in common
    my $passes_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
        "-$index_level";

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if $verbose;
    my $handle;
    if ($debugging) {
        $handle = STDOUT;
    } else {
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_passes'}\n"
            unless (-e "$exe_path{'mg_passes'}" &&
                    open (PIPEOUT, "$passes_cmd -m 32 -s 0 -G -t 10 -N1 $osextra"));
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text (1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});
    close ($handle) unless $debugging;

    $self->print_stats();

    unless ($debugging) {
        # create the perfect hash function
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_perf_hash_build'}\n"
            unless -e "$exe_path{'mg_perf_hash_build'}";
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # reopen the pipe for the second pass
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_passes'}\n"
            unless (-e "$exe_path{'mg_passes'}" &&
                    open ($handle, "$passes_cmd -c 3 -G -t 10 -N2 $osextra"));
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if $verbose;

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$debugging) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_weights_build'}\n"
            unless -e "$exe_path{'mg_weights_build'}";
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_invf_dict'}\n"
            unless -e "$exe_path{'mg_invf_dict'}";
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if $verbose;
        die "mgbuilder::build_index - couldn't run $exe_path{'mg_stem_idx'}\n"
            unless -e "$exe_path{'mg_stem_idx'}";
        foreach my $method (1 .. 3) {
            system ("mg_stem_idx$exe -b 4096 -s$method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($build_dir, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir (DIR)) {
            next if $file =~ /^\./;

            # files without a suffix and files with a wanted suffix stay
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined ($suffix);
            next if defined ($wanted_index_files{$suffix});

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
621
# Creates the info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db (to STDOUT in
# debug mode): first a "[collection]" record holding the collection
# metadata from the configuration, then whatever the document processor
# writes in 'infodb' mode while the plugins read the documents, then
# the classification information and finally a "[browselist]" record
# listing all documents.
#
# Dies when txt2db is not found in $GSDLHOME/bin/$GSDLOS or when the
# pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat ($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat ($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ tr{/}{\\} if ($ENV{'GSDLOS'} =~ /^windows$/i);

    # full path of txt2db; only used to check it is installed
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat ($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n"
            unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\""));
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text (1);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        # the metadata names of indexes are written as directory names
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            # names starting with "." are looked up in the index mapping,
            # all other names are written out as they are
            my $cmetamap = "";
            if ($cmeta =~ s/^\.//) {
                unless (defined $self->{'index_mapping'}->{$cmeta}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
                $cmetamap = $self->{'index_mapping'}->{$cmeta};
                $cmeta = ".$cmeta";
            } else {
                $cmetamap = $cmeta;
            }

            my $values = $collectionmeta->{$cmeta};
            my @langkeys = keys (%$values);

            # the first value seen stands in when there is no default entry
            my $default = scalar (@langkeys) ? $values->{$langkeys[0]} : "";
            my $defaultfound = 0;
            my $metadata_entry = "";

            foreach my $lang (@langkeys) {
                if ($lang =~ /default/) {
                    #the default entry goes first
                    $defaultfound = 1;
                    $metadata_entry = "<$cmetamap>" .
                        $values->{'default'} . "\n" . $metadata_entry;
                    next;
                }

                # language specific entries have keys like [l=en]
                my ($code) = $lang =~ /^\[l=(\w*)\]$/;
                if ($code) {
                    $metadata_entry .= "<$cmetamap:$code>" .
                        $values->{$lang} . "\n";
                }
            }

            #if we haven't found a default, put one in
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry
                unless $defaultfound;

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar (@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>" . join (";", @doclist);
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};
}
753
# Does nothing in this class.
sub collect_specific {
    my $self = shift;
}
757
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, the word and section counts
# reported by mgstat (when it can be run), the index, subcollection and
# language mappings, and the directory names of indexes that were not
# built. A warning is printed to the output handle when mgstat is not
# available; this is not fatal.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # Collected settings. This has to be a fresh lexical for every call:
    # several keys are only set under certain conditions and must not
    # survive from an earlier call.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
824
# Does nothing in this class.
sub deinit {
    my $self = shift;
}
828
# Prints the byte counts of the pass that has just finished to the
# output handle, using the figures reported by the document processor.
# A warning is added when fewer than 50 bytes were processed, except
# for a text compression pass with no_text switched on.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    print $outhandle ($indexing_text
                      ? "Stats (Creating index $index)\n"
                      : "Stats (Compressing text from $index)\n");
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # when we get here without indexing, no_text is known to be off
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
857
8581;
Note: See TracBrowser for help on using the repository browser.