source: main/tags/2.40/gsdl/perllib/mgbuilder.pm@ 21110

Last change on this file since 21110 was 4743, checked in by sjboddie, 21 years ago

Build code changes allowing mg collections containing no indexes to
be built (it in fact builds a small "dummy:text" index if none are
specified since we need an index for the runtime code to be able to
retrieve the compressed text).

  • Property svn:keywords set to Author Date Id Revision
File size: 29.2 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# While this module is loaded, STDOUT and STDERR are switched to
# autoflush so that mg doesn't get out of sync with plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value handed to mg_passes through its -b option.
our $maxdocsize = 12000;

# Suffixes of the files that are kept in an index directory once an index
# has been built; build_index deletes files with any other suffix.
our %wanted_index_files =
    map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);

59
60
# Constructor.
#
# Arguments (positional): $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $allclassifications, $outhandle, $no_text,
# $failhandle.  $outhandle and $failhandle default to the handle name
# "STDERR" and $no_text defaults to 0.
#
# Reads $ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg, expands the configured
# indexes by subcollection and language, loads the plugins and classifiers
# named in the configuration and creates the buildproc object.
# Dies if collect.cfg is missing or if no plugins could be loaded.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    # The defaults are the handle *name* as a string (which is what the
    # original bareword evaluated to); build_index compares $outhandle
    # against the string "STDERR".
    $outhandle = 'STDERR' unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $self->{'collect_cfg'}->{'indexsubcollections'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $subcollection (@{$self->{'collect_cfg'}->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per language
    if (defined $self->{'collect_cfg'}->{'languages'}) {
        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) {
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    if (defined($self->{'collect_cfg'}->{'indexes'})) {
        # make sure that the same index isn't specified more than once
        # (the first occurrence wins and the order is kept)
        my %seen = ();
        my @configured = @{$self->{'collect_cfg'}->{'indexes'}};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $i (@configured) {
            if (!defined ($seen{$i})) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, $i);
                $seen{$i} = 1;
            }
        }
    } else {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }

    if (scalar(@{$self->{'collect_cfg'}->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$self->{'collect_cfg'}->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $dg (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name held in
    # $buildproctype; this needs no string eval and any error raised by
    # the constructor propagates to our caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
189
# Prepares the build directory.  Unless the builder is in debug mode or
# was asked to keep the old build, the build directory is removed and
# recreated together with its "text" subdirectory.
sub init {
    my $self = shift (@_);

    if (!($self->{'debug'} || $self->{'keepold'})) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
203
# Creates the compressed text for the collection.
#
# $textindex is passed to the buildproc with set_index().
#
# Outside debug mode the documents are read twice through plugin::read:
#   1. piped into "mg_passes ... -T1" (text statistics),
#   2. after mg_compression_dict has been run, piped into
#      "mg_passes ... -T2" (the compression pass).
# In debug mode no external program is started and the buildproc writes
# to STDOUT instead.
#
# Dies if one of the executables is not found under
# $GSDLHOME/bin/$GSDLOS or if a pipe cannot be opened.
#
# NOTE(review): existence is tested on the full path, but the commands are
# started by bare name ("mg_passes$exe"), so they are presumably resolved
# through PATH - confirm.
204sub compress_text {
205 my $self = shift (@_);
206 my ($textindex) = @_;
207 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
208 my $exe = &util::get_os_exe ();
209 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
210 my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
211 my $outhandle = $self->{'outhandle'};
212
# maxnumeric defaults to 4 and is only overridden by an all-digit
# value from collect.cfg; it is passed to mg_passes as -M
213 my $maxnumeric = 4;
214 if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
215 $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
216 $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
217 }
218
219 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
220 my $basefilename = "text/$self->{'collection'}";
221 my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);
222
# on windows the prefix is rewritten with backslashes; everywhere else
# the extra arguments " -d /" are appended to every mg command line
223 my $osextra = "";
224 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
225 $fulltextprefix =~ s@/@\\@g;
226 } else {
227 $osextra = " -d /";
228 }
229
230 print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
231
232 # collect the statistics for the text
233 # -b $maxdocsize sets the maximum document size to be 12 meg
234 print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
235
236 my ($handle);
237 if ($self->{'debug'}) {
238 $handle = STDOUT;
239 } else {
240 if (!-e "$mg_passes_exe" ||
241 !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
242 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
243 }
# $handle holds the package-qualified handle name, not a reference
244 $handle = mgbuilder::PIPEOUT;
245 }
246
# first pass: text mode, not indexing; text is stored unless no_text is set
247 $self->{'buildproc'}->set_output_handle ($handle);
248 $self->{'buildproc'}->set_mode ('text');
249 $self->{'buildproc'}->set_index ($textindex);
250 $self->{'buildproc'}->set_indexing_text (0);
251 if ($self->{'no_text'}) {
252 $self->{'buildproc'}->set_store_text(0);
253 } else {
254 $self->{'buildproc'}->set_store_text(1);
255 }
256 $self->{'buildproc'}->reset();
257 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
258 $self->{'buildproc'}, $self->{'maxdocs'});
259 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
260 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
261 &plugin::end($self->{'pluginfo'});
262
263 close ($handle) unless $self->{'debug'};
264
265 $self->print_stats();
266
267 # create the compression dictionary
268 # the compression dictionary is built by assuming the stats are from a seed
269 # dictionary (-S), if a novel word is encountered it is spelled out (-H),
270 # and the resulting dictionary must be less than 5 meg with the most frequent
271 # words being put into the dictionary first (-2 -k 5120)
272 if (!$self->{'debug'}) {
273 print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
274 if (!-e "$mg_compression_dict_exe") {
275 die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
276 }
# the exit status of mg_compression_dict is not checked
277 system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");
278
279 # -b $maxdocsize sets the maximum document size to be 12 meg
# $handle still holds the PIPEOUT handle name, so this reopens that same
# handle (already set on the buildproc) for the second pass
280 if (!-e "$mg_passes_exe" ||
281 !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
282 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
283 }
284 }
285
286 $self->{'buildproc'}->reset();
287 # compress the text
288 print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
289 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
290 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
291 close ($handle) unless $self->{'debug'};
292
293 $self->print_stats();
294}
295
# Decides whether an index should be built.
#
# Each entry of the "dontbuild" configuration list is used as a pattern
# that has to match the whole index description.  On the first match the
# directory name mapped to the index is recorded in the 'notbuilt' list
# and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    for my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;

        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
311
# Builds the indexes of the collection.
#
# If $indexname contains a word character only that index is considered,
# otherwise every index from the collection configuration is.  The
# index -> directory mapping is (re)created for exactly those indexes and
# stored in $self->{'index_mapping'}; each index is then built unless
# want_built() rejects it.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
339
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:gran[:subcollection[:languages]]".
#
# Returns a reference to a hash that contains
#   - one entry per index description giving its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap' sub-hashes (and the
#     matching '...maporder' lists) for the individual components,
#   - the component mappings again as top-level entries.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers, per component type, which original name each
    # processed name stands for.  Every type needs its own hash: a
    # non-reference value here would be used as a symbolic reference, so
    # all three types would end up sharing one global hash.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        # (make_unique updates $pindex/$psub/$plang through the references)
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
411
# Returns a processed (shortened) version of a field.
#
# - An undefined field, or one without any word character, gives "".
# - If the field has two or more comma separated components, the result
#   is the first character of each of the first two components.
# - Otherwise the result is the first character of the field followed by
#   the next consonant; if there is no such consonant the first two
#   characters are used, and a field of a single character is returned
#   as that character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        my @initials = @components[0, 1];
        foreach my $component (@initials) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    unless (defined $first && defined $second) {
        # Neither pattern matched (e.g. a one character field): keep what
        # there is instead of silently returning an empty name.
        ($first) = $field =~ /^(.)/;
        $first = "" unless defined $first;
        $second = "";
    }

    return "$first$second";
}
434
# Changes one component of a directory name that collides with a name
# that is already in use, and returns the new directory name.
#
#   $namehash - hash with the keys 'index', 'subcollection' and
#               'languages', each mapping processed names to the original
#               name they were created for
#   $index    - the index description "level:gran[:subcollection[:languages]]"
#   $indexref, $subref, $langref
#             - references to the processed index, subcollection and
#               language names; the one that is changed is updated in place
#
# The first component whose processed name is registered for a different
# original name gets its next version.  If none of them differs the index
# component is changed anyway, so that a caller which keeps calling until
# the name is free always makes progress.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $self->get_next_version ($indexref);
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    } else {
        # nothing identifies the clash; returning the same name again
        # would never resolve it
        $self->get_next_version ($indexref);
    }
    return "$$indexref$$subref$$langref";
}
449
# Replaces the name referenced by $nameref with its next version, in place.
#
# - a name ending in two digits has that two digit number incremented,
# - a name ending in one digit has that digit incremented (9 becomes 10),
# - an empty name becomes "0",
# - any other name has its last character replaced by 0.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # only one trailing digit here, so that single digit is what has
        # to be replaced - also when it rolls over from 9 to 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref eq "") {
        # there is no last character to replace
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
}
465
# Builds one index.
#
# $index is an index description "level:fields[:subcollection[:language]]";
# its directory name is taken from $self->{'index_mapping'}.
#
# Outside debug mode the steps are, in this order:
#   1. documents piped into "mg_passes ... -N1" (index dictionary),
#   2. mg_perf_hash_build,
#   3. documents piped into "mg_passes ... -N2" (inverting the text),
#   4. mg_weights_build, mg_invf_dict and mg_stem_idx (-s1, -s2, -s3),
#   5. files in the index directory whose suffix is not listed in
#      %wanted_index_files are deleted.
# In debug mode no external program is started and the buildproc writes
# to STDOUT.
#
# Dies if one of the executables is not found under
# $GSDLHOME/bin/$GSDLOS, if a pipe cannot be opened or if the index
# directory cannot be read.  The exit status of the programs started with
# system() is not checked.
466sub build_index {
467 my $self = shift (@_);
468 my ($index) = @_;
469 my $outhandle = $self->{'outhandle'};
470
471 # get the full index directory path and make sure it exists
472 my $indexdir = $self->{'index_mapping'}->{$index};
473 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
474 my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
475 $self->{'collection'});
476 my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
477 $self->{'collection'});
478
479 # get any os specific stuff
480 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
481 my $exe = &util::get_os_exe ();
482 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
483 my $mg_perf_hash_build_exe =
484 &util::filename_cat($exedir, "mg_perf_hash_build$exe");
485 my $mg_weights_build_exe =
486 &util::filename_cat ($exedir, "mg_weights_build$exe");
487 my $mg_invf_dict_exe =
488 &util::filename_cat ($exedir, "mg_invf_dict$exe");
489 my $mg_stem_idx_exe =
490 &util::filename_cat ($exedir, "mg_stem_idx$exe");
491
# maxnumeric defaults to 4 and is only overridden by an all-digit
# value from collect.cfg; it is passed to mg_passes as -M
492 my $maxnumeric = 4;
493 if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
494 $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
495 $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
496 }
497
# NOTE(review): only $fullindexprefix is rewritten with backslashes on
# windows; $fulltextprefix (used for mg_weights_build -t) is not - confirm
# whether that is intended.
498 my $osextra = "";
499 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
500 $fullindexprefix =~ s@/@\\@g;
501 } else {
502 $osextra = " -d /";
# string comparison against the handle name "STDERR"
503 if ($outhandle ne "STDERR") {
504 # so mg_passes doesn't print to stderr if we redirect output
505 $osextra .= " 2>/dev/null";
506 }
507 }
508
509 # get the index level from the index description
510 # the index will be level 2 unless we are building a
511 # paragraph level index
512 my $index_level = 2;
513 $index_level = 3 if $index =~ /^paragraph/i;
514
515 # get the index expression if this index belongs
516 # to a subcollection
517 my $indexexparr = [];
518
519 # there may be subcollection info, and language info.
520 my ($level, $fields, $subcollection, $language) = split (":", $index);
521 my @subcollections = ();
522 @subcollections = split /,/, $subcollection if (defined $subcollection);
523
# only subcollections that are defined in collect.cfg contribute an expression
524 foreach $subcollection (@subcollections) {
525 if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
526 push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
527 }
528 }
529
530 # add expressions for languages if this index belongs to
531 # a language subcollection - only put languages expressions for the
532 # ones we want in the index
533
534 my @languages = ();
535 @languages = split /,/, $language if (defined $language);
536 foreach $language (@languages) {
# a leading "!" is stripped from the language and turns the expression
# into "!Language/.../"
537 my $not=0;
538 if ($language =~ s/^\!//) {
539 $not = 1;
540 }
541 foreach $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
542 if ($lang eq $language) {
543 if($not) {
544 push (@$indexexparr, "!Language/$language/");
545 } else {
546 push (@$indexexparr, "Language/$language/");
547 }
548 last;
549 }
550 }
551 }
552
553 # Build index dictionary. Uses verbatim stem method
554 print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
555 my ($handle);
556 if ($self->{'debug'}) {
557 $handle = STDOUT;
558 } else {
559 if (!-e "$mg_passes_exe" ||
560 !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
561 "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
562 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
563 }
# $handle holds the package-qualified handle name, not a reference
564 $handle = mgbuilder::PIPEOUT;
565 }
566
567 # set up the document processor
568 $self->{'buildproc'}->set_output_handle ($handle);
569 $self->{'buildproc'}->set_mode ('text');
570 $self->{'buildproc'}->set_index ($index, $indexexparr);
571 $self->{'buildproc'}->set_indexing_text (1);
572 $self->{'buildproc'}->set_store_text(1);
573
574 $self->{'buildproc'}->reset();
575 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
576 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
577 close ($handle) unless $self->{'debug'};
578
579 $self->print_stats();
580
581 if (!$self->{'debug'}) {
582 # create the perfect hash function
583 if (!-e "$mg_perf_hash_build_exe") {
584 die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
585 }
586 system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");
587
# $handle still holds the PIPEOUT handle name, so this reopens that same
# handle (already set on the buildproc) for the second pass
588 if (!-e "$mg_passes_exe" ||
589 !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
590 "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
591 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
592 }
593 }
594
595 # invert the text
596 print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
597
598 $self->{'buildproc'}->reset();
599 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
600 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
601
602 $self->print_stats ();
603
604 if (!$self->{'debug'}) {
605
# the second mg_passes pipe is closed before the remaining tools are run
606 close ($handle);
607
608 # create the weights file
609 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
610 if (!-e "$mg_weights_build_exe") {
611 die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
612 }
613 system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");
614
615 # create 'on-disk' stemmed dictionary
616 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
617 if (!-e "$mg_invf_dict_exe") {
618 die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
619 }
620 system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");
621
622
623 # creates stem index files for the various stemming methods
624 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
625 if (!-e "$mg_stem_idx_exe") {
626 die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
627 }
628 system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
629 system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
630 system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");
631
632 # remove unwanted files
633 my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
634 opendir (DIR, $tmpdir) || die
635 "mgbuilder::build_index - couldn't read directory $tmpdir\n";
636 foreach $file (readdir(DIR)) {
# entries starting with "." and files without a suffix are left alone
637 next if $file =~ /^\./;
638 my ($suffix) = $file =~ /\.([^\.]+)$/;
639 if (defined $suffix && !defined $wanted_index_files{$suffix}) {
640 # delete it!
641 print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
642 &util::rm (&util::filename_cat ($tmpdir, $file));
643 }
644 }
645 closedir (DIR);
646 }
647}
648
# Creates the info database and processes the associated files.
#
# Makes sure the "text" and "assoc" directories exist below the build
# directory, then writes, in this order:
#   1. a [collection] entry built from the collectionmeta configuration
#      (only if collectionmeta is configured),
#   2. whatever the buildproc produces in 'infodb' mode while the
#      documents are read through plugin::read,
#   3. the classification information,
#   4. a [browselist] entry listing all documents.
# The output is piped into txt2db, or goes to STDOUT in debug mode.
#
# Dies if txt2db is not found under $GSDLHOME/bin/$GSDLOS or if the pipe
# cannot be opened.
649sub make_infodatabase {
650 my $self = shift (@_);
651 my $outhandle = $self->{'outhandle'};
652
653 my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
654 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
655 &util::mk_all_dir ($textdir);
656 &util::mk_all_dir ($assocdir);
657
658 # get db name
# the extension is ".ldb" when util::is_little_endian() is true, ".bdb" otherwise
659 my $dbext = ".bdb";
660 $dbext = ".ldb" if &util::is_little_endian();
661 my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
662 $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);
663
664 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
665 my $exe = &util::get_os_exe ();
666 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");
667
668 print $outhandle "\n*** creating the info database and processing associated files\n"
669 if ($self->{'verbosity'} >= 1);
670
671 # init all the classifiers
672 &classify::init_classifiers ($self->{'classifiers'});
673
674
675 # set up the document processor
676 my ($handle);
677 if ($self->{'debug'}) {
678 $handle = STDOUT;
679 } else {
680 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
681 die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
682 }
# $handle holds the package-qualified handle name, not a reference
683 $handle = mgbuilder::PIPEOUT;
684 }
685
686 $self->{'buildproc'}->set_output_handle ($handle);
687 $self->{'buildproc'}->set_mode ('infodb');
688 $self->{'buildproc'}->set_assocdir ($assocdir);
689 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
690 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
691 $self->{'buildproc'}->set_indexing_text (0);
692 $self->{'buildproc'}->set_store_text(1);
693 $self->{'buildproc'}->reset();
694
695 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
696
# the mapping is only created here if build_indexes has not already set it
697 if (!defined $self->{'index_mapping'}) {
698 $self->{'index_mapping'} =
699 $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
700 }
701
702 print $handle "[collection]\n";
703
704 foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
705 my $defaultfound=0;
706 my $first=1;
707 my $metadata_entry = "";
708 my $default="";
709 my $cmetamap = "";
# a name starting with "." is looked up in the index mapping (without the
# dot) and written under its mapped name; the dot is put back afterwards
# because the configuration is keyed by the dotted name
710 if ($cmeta =~ s/^\.//) {
711 if (defined $self->{'index_mapping'}->{$cmeta}) {
712 $cmetamap = $self->{'index_mapping'}->{$cmeta};
713 $cmeta = ".$cmeta";
714 }
715 else {
716 print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
717 next; #ignore this one
718 }
719 }
720 else {
721 $cmetamap = $cmeta; # just using the same name
722 }
723 #iterate through the languages
724 foreach $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
725 if ($first) {
726 $first=0;
727 #set the default default to the first entry
728 $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
729 }
# NOTE(review): any key containing "default" takes this branch, but the
# value written is always the one stored under the exact key 'default'
730 if ($lang =~ /default/) {
731 $defaultfound=1;
732 #the default entry goes first
733 $metadata_entry = "<$cmetamap>" .
734 $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
735 }
736 else {
# other keys are only written if they have the form "[l=xx]"
737 my ($l) = $lang =~ /^\[l=(\w*)\]$/;
738 if ($l) {
739 $metadata_entry .= "<$cmetamap:$l>" .
740 $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";
741 }
742 }
743 }
744 #if we haven't found a default, put one in
745 if (!$defaultfound) {
746 $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
747 }
748 #write the entry to the file
749 print $handle $metadata_entry;
750
751 }
752
# a line of 70 dashes ends the entry
753 print $handle "\n" . ('-' x 70) . "\n";
754
755 }
756
757
758 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
759 "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
760
761 # output classification information
762 &classify::output_classify_info ($self->{'classifiers'}, $handle,
763 $self->{'allclassifications'});
764
765
766
767 #output doclist
# the document list from the buildproc is written as one ";" separated line
768 my @doclist = $self->{'buildproc'}->get_doc_list();
769 my $docs = join (";",@doclist);
770 print $handle "[browselist]\n";
771 print $handle "<hastxt>0\n";
772 print $handle "<childtype>VList\n";
773 print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
774 print $handle "<thistype>Invisible\n";
775 print $handle "<contains>$docs";
776 print $handle "\n" . ('-' x 70) . "\n";
777
778 close ($handle) if !$self->{'debug'};
779}
780
# Placeholder that does no work in this class; it evaluates to the
# object it was called on.
sub collect_specific {
    my $self = shift;
    return $self;
}
784
# Writes the build.cfg file into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the number of words and sections reported
# by mgstat (if it can be run), the index, subcollection and language
# maps from $self->{'index_mapping'}, the indexes that were not built
# and the maxnumeric setting.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # All settings are collected in a structure that is private to this
    # call, so nothing is carried over from an earlier call.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        # missing stats are not fatal; build.cfg is written without them
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only stored if not empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'} if scalar @{$self->{'notbuilt'}};

    # maxnumeric defaults to 4 and is only overridden by an all-digit
    # value from collect.cfg
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
857
# Placeholder that does no work in this class; it evaluates to the
# object it was called on.
sub deinit {
    my $self = shift;
    return $self;
}
861
# Prints the statistics of the last pass of the buildproc to the output
# handle: which index was processed, the total number of bytes in the
# collection and the number of bytes processed for the index.  A warning
# is added if fewer than 50 bytes were processed, unless the pass was a
# text compression pass with 'no_text' set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # when we get here and are not indexing, 'no_text' is known to be unset
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
890
8911;
Note: See TracBrowser for help on using the repository browser.