source: main/tags/2.52/gsdl/perllib/mgbuilder.pm@ 25422

Last change on this file since 25422 was 8361, checked in by kjdon, 20 years ago

Renamed the build option 'allclassifications' to 'remove_empty_classifications'. This means that empty classifications (classifiers and internal nodes) are now displayed by default. Note: if a collection was previously built by the GLI with the 'allclassifications' option set, building will fail until that old option is deleted from the collname.col file.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.6 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Switch STDOUT and STDERR to autoflush while this module is loaded so
# that output from mg does not get out of sync with output from plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Turn autoflush back off when the program exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}

# Value handed to mg_passes through its -b option (maximum document size).
$maxdocsize = 12000;

# File suffixes that are kept in an index directory; build_index deletes
# any file whose suffix is not a key of this hash.
%wanted_index_files = map { $_ => 1 } qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor for an mgbuilder object.
#
# Positional arguments:
#   $class, $collection, $source_dir, $build_dir, $verbosity, $maxdocs,
#   $debug, $keepold, $remove_empty_classifications,
#   $outhandle  (defaults to STDERR),
#   $no_text    (defaults to 0),
#   $failhandle (defaults to STDERR),
#   $gli        (defaults to 0)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, removes duplicate index descriptions, loads
# the plugins and classifiers and creates the buildproc object.
#
# Dies if collect.cfg does not exist, if no plugins could be loaded, or if
# the buildproc object cannot be constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # NOTE: the default stays the bareword STDERR (i.e. the string "STDERR")
    # because build_index compares $outhandle against that string.
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},    # indexes not built
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # from here on 'indexes' is always an array reference
    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is built once per
    # entry in indexsubcollections
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is built once per language.
    # When there is no subcollection part, an empty subcollection field is
    # added so that the language is always the fourth ':'-separated field.
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, $index . $separator . $language);
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (first occurrence wins, order is preserved)
    {
        my %seen = ();
        my @unique = grep { !$seen{$_}++ } @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = \@unique;
    }

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }

    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor through an explicit class-method call rather
    # than a string eval using indirect-object syntax, which is ambiguous
    # in a package that defines its own new().
    eval {
        $self->{'buildproc'} = $buildproctype->new($collection, $source_dir,
                                                   $build_dir, $verbosity,
                                                   $outhandle);
    };
    die "$@" if $@;

    return $self;
}
203
# Prepares the build directory.  Unless the builder is in debug mode or has
# been asked to keep the old build, the existing build directory is removed
# and recreated together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    unless ($self->{'debug'} || $self->{'keepold'}) {
        my $build_dir = $self->{'build_dir'};

        # remove any old builds and start again with an empty directory
        &util::rm_r($build_dir);
        &util::mk_all_dir($build_dir);

        # make the text directory
        &util::mk_all_dir("$build_dir/text");
    }
}
217
# Creates the compressed text for the collection under <build_dir>/text.
#
# $textindex is handed to the buildproc via set_index().
#
# The documents are read twice through the plugins:
#   1. piped into "mg_passes -T1" to collect the text statistics,
#   2. after mg_compression_dict has been run, piped into "mg_passes -T2"
#      to compress the text.
# In debug mode nothing is piped to the mg programs; the buildproc output
# goes to STDOUT and the compression dictionary step is skipped.
#
# Dies if a required executable is missing from $GSDLHOME/bin/$GSDLOS or a
# pipe cannot be opened.  When running under the GLI, stage/phase/error
# tags are printed to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # maxnumeric comes from collect.cfg when it is a plain number,
    # otherwise the default of 4 is used
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # on windows the prefix uses backslashes; elsewhere "-d /" is added
    # to the mg command lines
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # the FatalError tag is self-closing here, as it is everywhere
            # else in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);


    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    # first pass over the documents
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});


    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        # ($handle still names the PIPEOUT handle, which is reopened here)
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    # second pass over the documents
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
325
# Decides whether an index should be built.
#
# Each entry of the collect.cfg 'dontbuild' list is used as a pattern that
# must match the whole index description.  On the first match the index is
# recorded in $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        # remember that this index was deliberately skipped
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
342
# Builds either the single index named by $indexname (when it contains at
# least one word character) or every index listed in collect.cfg.
#
# The index -> directory mapping is (re)created for the chosen indexes and
# stored in $self->{'index_mapping'}; indexes rejected by want_built are
# reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        if ($self->{'verbosity'} >= 1) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n";
        }
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }
}
371
# Creates directory names for each of the index descriptions.
#
# $indexes is a reference to a list of index descriptions of the form
# "level:fields[:subcollection[:languages]]".
#
# Returns a reference to a hash containing
#   - each full index description            => its directory name
#   - 'indexmap', 'subcollectionmap', 'languagemap'
#                                            => hashes from the original
#                                               field to its processed name
#   - 'indexmaporder', 'subcollectionmaporder', 'languagemaporder'
#                                            => the same fields in the order
#                                               they were first seen
#   - each "level:fields", subcollection and language string
#                                            => its processed name
# Directory names are made unique with make_unique.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames records, separately for each part of the directory name, which
    # original field a processed name was generated for.  Each entry must
    # be its own hash: initialising them to '' made every lookup a symbolic
    # reference to one shared global hash, so the three name spaces
    # overwrote each other and entries survived from one call to the next.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        # (make_unique may modify $pindex, $psub or $plang in place)
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so far for later collision checks
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
443
# Returns the short, processed form of a field.
#
# An undefined field, or one without any word character, gives "".
# A field made of two or more comma-separated components gives the first
# character of each of its first two components.  A single-component field
# gives its first character followed by the next consonant, or failing
# that, simply its first two characters.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components >= 2) {
        # only the first two components contribute to the name
        my @initials = @components[0, 1];
        foreach my $part (@initials) {
            $part =~ s/^(.).*$/$1/;
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    unless (defined $first && defined $second) {
        # no consonant after the first character: fall back to the
        # first two characters
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
466
# Produces a new candidate directory name after a collision.
#
# $namehash holds, per name part ('index', 'subcollection', 'languages'),
# the original field each processed name was generated for.  $indexref,
# $subref and $langref refer to the processed parts of the current index.
# The first part whose recorded owner differs from the corresponding field
# of $index is advanced to its next version (in place); the concatenation
# of the three parts is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # find the first part of the name that belongs to a different field
    my $clashing;
    if ($namehash->{'index'}->{$$indexref} ne "$level:$gran") {
        $clashing = $indexref;
    } elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $clashing = $subref;
    } elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $clashing = $langref;
    }

    $self->get_next_version ($clashing) if defined $clashing;

    return "$$indexref$$subref$$langref";
}
481
# Advances the name referred to by $nameref to its next "version", in place.
#
#   - a name ending in two digits has that number incremented,
#   - a name ending in a single digit has the digit incremented
#     (9 becomes 10),
#   - any other name has its last character replaced by 0,
#   - an empty name becomes "0".
#
# The name is always changed, so callers that loop until a name is unique
# are guaranteed to make progress.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only ONE trailing digit is present in this branch, so a single
        # digit must be replaced.  The previous code tried to replace two
        # digits when the digit was 9, which never matched and left the
        # name unchanged.
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref !~ s/.$/0/) {
        # nothing to replace (empty name): start the numbering outright
        $$nameref = "0";
    }
}
497
# Builds one index.
#
# $index is an index description of the form
# "level:fields[:subcollection[:languages]]"; its directory name is looked
# up in $self->{'index_mapping'}.
#
# Steps (all but the plugin passes are skipped in debug mode):
#   1. pipe the documents into "mg_passes -N1" to create the index dictionary
#   2. mg_perf_hash_build
#   3. pipe the documents into "mg_passes -N2" to invert the text
#   4. mg_weights_build, mg_invf_dict and mg_stem_idx (-s1, -s2, -s3)
#   5. delete files whose suffix is not in %wanted_index_files
#
# If the index dictionary (.id file) is not produced, the index is recorded
# in $self->{'notbuilt'} and the sub returns without building it.
# Dies if a required executable is missing, a pipe cannot be opened or the
# index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # maxnumeric comes from collect.cfg when it is a plain number,
    # otherwise the default of 4 is used
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??

    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach $language (@languages) {
        # a leading "!" negates the language expression
        my $not=0;
        if ($language =~ s/^\!//) {
            $not = 1;
        }
        if($not) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        # close the GLI stage opened by build_indexes, as every other exit
        # path from this sub does
        print STDERR "</Stage>\n" if $self->{'gli'};
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # ($handle still names the PIPEOUT handle, which is reopened here)
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});


    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
698
# Creates the collection's info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in
# debug mode), in this order:
#   1. a [collection] entry built from the collect.cfg 'collectionmeta'
#   2. whatever the buildproc emits in 'infodb' mode while the plugins
#      read the source documents
#   3. the classification information from the classifiers
#   4. a [browselist] entry listing all documents
#
# Dies if txt2db is missing from $GSDLHOME/bin/$GSDLOS or the pipe cannot
# be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name (the extension depends on the endianness of the machine)
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        # the index mapping is needed to translate ".index" style
        # collectionmeta names; create it if build_indexes has not
        # already done so
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) {
            my $defaultfound=0;
            my $first=1;
            my $metadata_entry = "";
            my $default="";
            my $cmetamap = "";
            # a name starting with "." refers to an index, subcollection or
            # language and is written under its mapped (processed) name;
            # the "." is put back afterwards because $cmeta is still used
            # as the key into 'collectionmeta' below
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }
            #iterate through the languages
            foreach $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}})) {
                if ($first) {
                    $first=0;
                    #set the default default to the first entry
                    $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound=1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language specific keys have the form "[l=xx]"
                    my ($l) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($l) {
                        $metadata_entry .= "<$cmetamap:$l>" .
                            $self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang} . "\n";

                        # Use the English value as the default if no default is specified
                        # NOTE(review): /en/i matches any language code that
                        # merely contains "en" - confirm this is intended
                        if ($l =~ /en/i) {
                            $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}->{$lang};
                        }
                    }
                }
            }
            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            #write the entry to the file
            print $handle $metadata_entry;

        }

        # a line of 70 dashes ends the entry
        print $handle "\n" . ('-' x 70) . "\n";
    }

    # let the buildproc write its output for every document
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});


    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
836
# Does nothing beyond taking $self off the argument list.
# NOTE(review): presumably a hook for collection-specific builders to
# override - confirm against callers.
sub collect_specific {
    my $self = shift (@_);
}
840
# Writes <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts reported by
# mgstat (when it can be run), the index / subcollection / language maps
# from $self->{'index_mapping'}, the list of indexes that were not built
# and the maxnumeric setting.
#
# Failure to run mgstat only produces a warning on the output handle.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a fresh, lexical hash on every
    # call.  (It used to be accumulated in the package global $build_cfg,
    # so optional entries such as 'notbuilt' or 'numwords' from an earlier
    # call could leak into a later build.cfg.)
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # list the indexes that were skipped or could not be built
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # maxnumeric comes from collect.cfg when it is a plain number,
    # otherwise the default of 4 is used
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
922
# Does nothing beyond taking $self off the argument list; this builder has
# no clean-up work of its own.
sub deinit {
    my $self = shift (@_);
}
926
# Prints the statistics of the last pass to the output handle: which index
# was processed, the total number of bytes in the collection and the number
# of bytes actually processed.  When fewer than 50 bytes were processed
# (and text was being indexed, or text storing is not switched off) a
# warning is printed as well, and reported to the GLI on STDERR.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # was text expected from this pass at all?
    my $text_expected = ($indexing_text || !$self->{'no_text'});

    if ($num_processed_bytes < 50 && $text_expected) {
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
        print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
    }
}
956
9571;
958
959
960
Note: See TracBrowser for help on using the repository browser.