source: main/tags/2.51/gsdl/perllib/mgbuilder.pm@ 32629

Last change on this file since 32629 was 7150, checked in by mdewsnip, 20 years ago

Now chooses the English collectionmeta value (if it exists) to be the default, in the absence of a specified default value. If there is no default value and no English value, a random value will be used as the default.

  • Property svn:keywords set to Author Date Id Revision
File size: 32.5 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# While this module is loaded, keep STDOUT and STDERR unbuffered so that
# output from mg doesn't get out of sync with output from the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush back off for both streams when the interpreter exits.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value passed to mg_passes through its -b option (maximum document size).
# NOTE(review): comments elsewhere in this file describe it as "12 meg" -
# confirm the unit mg_passes expects.
$maxdocsize = 12000;

# File suffixes that are kept in an index directory; build_index deletes
# any file in that directory whose suffix is not listed here.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers named in
# the configuration and creates the document processor ("buildproc") used
# by the build passes.
#
# Arguments (positional):
#   $collection, $source_dir, $build_dir, $verbosity, $maxdocs, $debug,
#   $keepold, $allclassifications - stored on the object as given
#   $outhandle  - handle for progress output (defaults to STDERR)
#   $no_text    - defaults to 0
#   $failhandle - handle passed to the plugin loader (defaults to STDERR)
#   $gli        - defaults to 0; when true the build methods also print
#                 XML-style progress tags to STDERR
#
# Returns the new object.  Dies if collect.cfg is missing, if no plugins
# could be loaded, or if the buildproc module cannot be loaded/constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},    # indexes not built
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # from here on 'indexes' is always an array reference
    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my @expanded = ();
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index:$subcollection");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # sort out language subindexes: every index is repeated once per
    # language; if there is no subcollection part an empty subcollection
    # field is added so that the language is always the fourth field
    if (defined $cfg->{'languages'}) {
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        my @expanded = ();
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@{$cfg->{'indexes'}}) {
                push (@expanded, "$index$separator$language");
            }
        }
        $cfg->{'indexes'} = \@expanded;
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins, order is preserved)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $field (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name.  This
    # avoids both a string eval and the indirect-object form
    # "new Class(...)", whose parsing depends on which subs are visible
    # (this package defines its own "new").  Any exception propagates to
    # the caller, as before.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $outhandle);

    return $self;
}
201
# Prepares the build directory.  Unless the builder is in debug mode or
# was asked to keep old builds, the build directory is removed and
# re-created, together with its "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # make the text directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
215
# Creates the compressed text for the collection.
#
# The source documents are read twice through the plugins: the first pass
# pipes them into "mg_passes -T1" to collect text statistics, then
# mg_compression_dict builds the compression dictionary, and the second
# pass pipes them into "mg_passes -T2" to compress the text.  In debug
# mode no external program is run and the document text is written to
# STDOUT instead.
#
# Arguments:
#   $textindex - index specification handed to the buildproc's set_index
#
# Dies if one of the required executables is missing or cannot be started.
# When 'gli' is set, progress tags are printed to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # -M value for mg_passes; collect.cfg may override the default of 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # self-closed like every other FatalError tag in this file
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # first pass: feed every document to the statistics collector
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        # ($handle still names the PIPEOUT handle closed above, so this
        # re-opens the same handle the buildproc already writes to)
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    # second pass: feed every document to the compressor
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
318
# Decides whether an index should be built.
#
# Arguments:
#   $index - index specification
#
# Returns 0 if $index matches one of the collect.cfg 'dontbuild' entries
# (each entry is used as an anchored regular expression) and records the
# index in $self->{'notbuilt'}; returns 1 otherwise.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    if (defined ($self->{'collect_cfg'}->{'dontbuild'})) {
        # lexical loop variable, so no package global is touched
        foreach my $checkstr (@{$self->{'collect_cfg'}->{'dontbuild'}}) {
            if ($index =~ /^$checkstr$/) {
                $self->{'notbuilt'}->{$index} = 1;
                return 0;
            }
        }
    }

    return 1;
}
335
# Builds the indexes of the collection.
#
# Arguments:
#   $indexname - optional; when it contains a word character only this
#                index is built, otherwise every index listed in the
#                collection configuration is built
#
# Stores the index-to-directory mapping in $self->{'index_mapping'} and
# calls build_index for every index that want_built accepts.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = [];
    if (defined $indexname && $indexname =~ /\w/) {
        push @$indexes, $indexname;
    } else {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
    }

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if ($self->want_built($index)) {
            print $outhandle "\n*** building index $index in subdirectory " .
                "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
            # build_index prints the matching </Stage>
            print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
            $self->build_index($index);
        } else {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
        }
    }
}
364
# Creates directory names for each of the index descriptions.
#
# Arguments:
#   $indexes - reference to an array of index specifications of the form
#              "level:fields[:subcollection[:languages]]"
#
# Returns a reference to a hash containing
#   - one entry per index specification, giving its directory name
#   - 'indexmap', 'subcollectionmap' and 'languagemap': the short name of
#     each "level:fields", subcollection and language part
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder':
#     the same keys in first-seen order
#   - top-level shortcuts for every "level:fields", subcollection and
#     language key
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # For each kind of name part: short name => the full name it was
    # assigned to.  Each kind needs its own hash; starting these off as
    # empty strings would make all three resolve to one shared symbolic
    # global that also survives between calls.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        # NOTE(review): if make_unique returns the name unchanged this loop
        # does not terminate - confirm callers never pass the same index
        # specification twice
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are taken, for later collision handling
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
436
# Returns a processed (shortened) version of a field.
#
# If the field has two or more comma-separated components the result is
# the first character of each of the first two components.  Otherwise it
# is the first character of the field followed by the next consonant, or
# by the second character if there is no later consonant.  A field too
# short for either rule is reduced to its first word character, so a
# one-letter field still produces a usable name.
#
# Returns "" if the field is undefined or contains no word character.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # keep the first two components, one character from each
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
    return "$first$second" if defined $first && defined $second;

    # neither pattern matched (e.g. a one-character field); the guard at
    # the top guarantees that a word character exists
    my ($only) = $field =~ /(\w)/;
    return $only;
}
459
# Resolves a directory-name collision for $index.
#
# Arguments:
#   $namehash - hash with the keys 'index', 'subcollection' and
#               'languages', each mapping a short name to the full name
#               that already uses it
#   $index    - index specification "level:fields:subcollection:languages"
#   $indexref, $subref, $langref - references to the three short name parts
#
# The parts are checked in the order index, subcollection, languages; the
# first one whose short name is recorded for a different full name is
# advanced (in place) with get_next_version.  Returns the three parts
# concatenated.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [ key in $namehash, reference to the short name, full name wanted ]
    my @parts = (
        [ 'index',         $indexref, "$level:$gran" ],
        [ 'subcollection', $subref,   $subcollection ],
        [ 'languages',     $langref,  $languages     ],
    );

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;    # only one part is changed per call
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
474
# Advances a short name to its next numbered version, in place.
#
# Arguments:
#   $nameref - reference to the name to change
#
# A name ending in digits has that number incremented; any other name has
# its last character replaced by "0" (e.g. "dtx" -> "dt0" -> "dt1" ...
# "dt9" -> "dt10" -> "dt11").
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        # $num has only been used as a string, so ++ is Perl's string
        # increment and leading zeros survive ("00" becomes "01")
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # the name ends in exactly one digit here, so the substitution must
        # target that single digit - also when 9 rolls over to 10
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
490
# Builds a single index.
#
# The documents are read through the plugins twice: first piped into
# "mg_passes -N1" to create the index dictionary and then, after
# mg_perf_hash_build has run, into "mg_passes -N2" to invert the text.
# Afterwards mg_weights_build, mg_invf_dict and mg_stem_idx are run and
# every file in the index directory whose suffix is not listed in
# %wanted_index_files is deleted.  In debug mode no external program is
# run and the document text is written to STDOUT.
#
# Arguments:
#   $index - index specification "level:fields[:subcollection[:languages]]";
#            its directory is looked up in $self->{'index_mapping'}
#
# If the index dictionary (.id file) is missing after the first pass the
# index is recorded in $self->{'notbuilt'} and the method returns early.
# Dies if a required executable is missing or cannot be started.  When
# 'gli' is set, progress tags are printed to STDERR and the stage opened
# by build_indexes is closed on every exit path.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # -M value for the mg tools; collect.cfg may override the default of 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    # (the first two fields, level and fields, are not needed here)
    my (undef, undef, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??

    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $langspec (@languages) {
        # a leading "!" negates the language expression
        if ($langspec =~ s/^\!//) {
            push (@$indexexparr, "!Language/$langspec/");
        } else {
            push (@$indexexparr, "Language/$langspec/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole process doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        # close the GLI stage here as well, like every other exit path
        print STDERR "</Stage>\n" if $self->{'gli'};
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        # $handle still names the PIPEOUT handle closed above, so this
        # re-opens the handle the buildproc already writes to
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");


        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir (DIR);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
690
# Creates the collection information database.
#
# Records are piped into txt2db (or written to STDOUT in debug mode):
# first a "[collection]" record built from the collectionmeta entries of
# collect.cfg, then whatever the buildproc writes while the documents are
# read in 'infodb' mode, then the classifier output and finally a
# "[browselist]" record listing all documents.  The "text" and "assoc"
# directories are created below the build directory.
#
# Dies if txt2db is missing or cannot be started.  When 'gli' is set,
# progress tags are printed to STDERR.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            my $defaultfound=0;
            my $first=1;
            my $metadata_entry = "";
            my $default="";
            my $cmetamap = "";

            # a name starting with "." refers to an index, subcollection or
            # language and is written under its mapped short name
            my $stripped = $cmeta;
            if ($stripped =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$stripped}) {
                    $cmetamap = $self->{'index_mapping'}->{$stripped};
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$stripped' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }

            #iterate through the languages
            foreach my $lang (keys (%{$collectionmeta->{$cmeta}})) {
                if ($first) {
                    $first=0;
                    #set the default default to the first entry
                    $default=$collectionmeta->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound=1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $collectionmeta->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language-specific keys have the form "[l=xx]"
                    my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($langcode) {
                        $metadata_entry .= "<$cmetamap:$langcode>" .
                            $collectionmeta->{$cmeta}->{$lang} . "\n";

                        # Use the English value as the default if no default is specified
                        if ($langcode =~ /en/i) {
                            $default=$collectionmeta->{$cmeta}->{$lang};
                        }
                    }
                }
            }
            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            #write the entry to the file
            print $handle $metadata_entry;

        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'},
                                     $self->{'gli'});


    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
828
# Placeholder for collection-specific processing; does nothing here.
# Hands back the object it was called on, which is what the original
# one-line body evaluated to.
sub collect_specific {
    my ($self) = @_;
    return $self;
}
832
# Writes build.cfg into the build directory.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the word and section counts parsed from the
# output of mgstat (when it can be run), the index, subcollection and
# language maps, the list of indexes that were not built, and maxnumeric.
#
# When 'gli' is set, progress tags are printed to STDERR.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # A fresh lexical hash for every call: with a package global, optional
    # entries such as 'subcollectionmap' or 'notbuilt' from an earlier
    # call would leak into this build.cfg.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
914
# Counterpart to init; there is nothing to clean up here.  Hands back the
# object it was called on, which is what the original one-line body
# evaluated to.
sub deinit {
    my ($self) = @_;
    return $self;
}
918
# Prints statistics about the pass that has just finished to the output
# handle: the index name, the total bytes in the collection and the bytes
# processed for this index.  If fewer than 50 bytes were processed (and
# text was expected) a warning is printed as well, plus a GLI warning tag
# on STDERR when 'gli' is set.
sub print_stats {
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $out $heading
        . "Total bytes in collection: $num_bytes\n"
        . "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # the condition above guarantees that text was either being
        # indexed or was meant to be compressed
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";
        print $out "***************\n"
            . $warning
            . " Was this your intention?\n"
            . "***************\n";
        # NOTE(review): unlike the Phase/FatalError tags in this file this
        # tag is not self-closed - confirm what the GLI expects
        print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
    }
}
948
9491;
Note: See TracBrowser for help on using the repository browser.