source: trunk/gsdl/perllib/mgbuilder.pm@ 6282

Last change on this file since 6282 was 5768, checked in by kjdon, 21 years ago

Added a check for failed indexing: just test whether the .id file exists and, if it does not, don't continue building that index. Indexes that haven't been built are no longer included in the indexmap entry in the build config file, and therefore won't appear in the list of indexes.

  • Property svn:keywords set to Author Date Id Revision
File size: 30.1 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
BEGIN {
    # Turn on autoflush for STDERR and STDOUT so that output from mg
    # doesn't get out of sync with output from the plugins.
    my $previous = select(STDERR);
    $| = 1;
    select(STDOUT);
    $| = 1;
    select($previous);
}
41
END {
    # Switch autoflush back off for STDERR and STDOUT when the program exits.
    my $previous = select(STDERR);
    $| = 0;
    select(STDOUT);
    $| = 0;
    select($previous);
}
46
# Maximum document size passed to mg_passes through its -b option
# (described as "12 meg" where it is used).
$maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built.
# build_index deletes every file in the index directory whose suffix is
# not a key of this hash.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Creates a new mgbuilder object for a collection.
#
# Positional arguments:
#   $class              - class to bless the object into
#   $collection         - collection name
#   $source_dir         - directory handed to the plugins as the source
#   $build_dir          - directory the build is written to
#   $verbosity          - numeric verbosity level
#   $maxdocs            - passed through to plugin::begin / plugin::read
#   $debug              - when true, output goes to STDOUT rather than to
#                         the external mg programs
#   $keepold            - when true, init() keeps an existing build directory
#   $allclassifications - passed to classify::output_classify_info
#   $outhandle          - handle for progress messages (default STDERR)
#   $no_text            - when true, compress_text does not store the text
#                         (default 0)
#   $failhandle         - handle passed to plugin::load_plugins
#                         (default STDERR)
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the configured indexes by
# subcollection and language, loads the plugins and classifiers, and
# creates the buildproc object.
#
# Dies if collect.cfg does not exist, if no plugins were loaded, or if the
# buildproc object cannot be created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications,
        $outhandle, $no_text, $failhandle) = @_;

    # Handles default to the name of the standard error stream; the rest
    # of the module compares $outhandle against the string "STDERR".
    $outhandle  = 'STDERR' unless defined $outhandle;
    $no_text    = 0 unless defined $no_text;
    $failhandle = 'STDERR' unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {
        'collection'         => $collection,
        'source_dir'         => $source_dir,
        'build_dir'          => $build_dir,
        'verbosity'          => $verbosity,
        'maxdocs'            => $maxdocs,
        'debug'              => $debug,
        'keepold'            => $keepold,
        'allclassifications' => $allclassifications,
        'outhandle'          => $outhandle,
        'no_text'            => $no_text,
        'failhandle'         => $failhandle,
        'notbuilt'           => {}    # indexes not built
    }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once per
    # subcollection as "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes: every index is repeated once per
    # language; when there are no subcollections an empty subcollection
    # field is inserted so the language is always the fourth field
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                }
                else {
                    push (@{$cfg->{'indexes'}}, $index . "::" . $language);
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # (the first occurrence wins and the order is kept)
    my %seen = ();
    my @unique = ();
    foreach my $i (@{$cfg->{'indexes'}}) {
        next if defined ($seen{$i});
        $seen{$i} = 1;
        push (@unique, $i);
    }
    $cfg->{'indexes'} = \@unique;

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the class name rather than
    # evaluating generated source text: nothing derived from the collection
    # name is compiled as code, and a failure is still rethrown unchanged.
    $self->{'buildproc'} = eval {
        $buildproctype->new ($collection, $source_dir, $build_dir,
                             $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}
193
# Prepares the build directory. Unless the builder is in debug mode or was
# asked to keep the old build, the build directory is removed, recreated,
# and given a fresh "text" subdirectory.
sub init {
    my ($self) = @_;

    if (!($self->{'debug'} || $self->{'keepold'})) {
        # remove any old builds
        &util::rm_r($self->{'build_dir'});
        &util::mk_all_dir($self->{'build_dir'});

        # make the text directory
        &util::mk_all_dir("$self->{'build_dir'}/text");
    }
}
207
# Creates the compressed text for the collection.
#
#   $textindex - index description handed to the buildproc via set_index
#
# The documents are run through the plugins twice: once piped into
# mg_passes -T1 to collect text statistics, and, after mg_compression_dict
# has built the compression dictionary, once piped into mg_passes -T2 to
# compress the text. In debug mode nothing is piped to the mg programs; the
# buildproc output goes to STDOUT instead.
#
# Dies if an mg executable is missing from $GSDLHOME/bin/$GSDLOS or a pipe
# cannot be opened.
sub compress_text {
    my ($self, $textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # maxnumeric from collect.cfg is only honoured when it is all digits
    my $maxnumeric = 4;
    my $cfg_maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    if (defined($cfg_maxnumeric) && $cfg_maxnumeric =~ /^\d+$/) {
        $maxnumeric = $cfg_maxnumeric;
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    # windows wants backslashes in the prefix; everywhere else the mg
    # programs are given "-d /"
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        my $stats_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra";
        unless (-e "$mg_passes_exe" && open (PIPEOUT, $stats_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        # the handle is passed around by its fully qualified name
        $handle = 'mgbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text ($self->{'no_text'} ? 0 : 1);
    $buildproc->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        my $compress_cmd = "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra";
        unless (-e "$mg_passes_exe" && open ($handle, $compress_cmd)) {
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $buildproc->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();
}
299
# Decides whether an index should be built.
#
#   $index - index description to check
#
# Each entry of the collect.cfg "dontbuild" list is used as a regular
# expression anchored at both ends. When one matches, the index is recorded
# in $self->{'notbuilt'} and 0 is returned; otherwise 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
316
# Builds the collection's indexes.
#
#   $indexname - optional; when it contains a word character only this
#                index is built, otherwise every index listed in the
#                collection configuration is built
#
# The mapping from index descriptions to directory names is (re)created
# first and stored in $self->{'index_mapping'}. Indexes rejected by
# want_built are reported and skipped.
sub build_indexes {
    my ($self, $indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }
        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
344
# Creates directory names for each of the index descriptions.
#
#   $indexes - reference to a list of index descriptions of the form
#              "level:gran[:subcollection[:languages]]"
#
# Returns a reference to a hash that contains
#   - one entry per full index description, giving its directory name
#   - 'indexmap', 'subcollectionmap' and 'languagemap': hashes mapping
#     "level:gran", subcollection and language fields to their processed
#     short names
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder':
#     lists holding the keys of those maps in order of first appearance
#   - top-level entries for "level:gran", the subcollection field and the
#     language field, again giving the processed short names
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # pnames remembers which original field each processed name was made
    # from. Each entry has to start out as its own hash reference: a
    # defined non-reference value such as '' is not autovivified, so all
    # three entries would otherwise be used as symbolic references to one
    # and the same global hash, shared between them and between calls.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping thing -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) -these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{"$level:$gran"}) {
            $mapping{'indexmap'}{"$level:$gran"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$level:$gran");
            if (!defined $mapping{"$level:$gran"}) {
                $mapping{"$level:$gran"} = $pindex;
            }
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been used so later indexes can detect clashes
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
416
# Returns a processed (shortened) version of a field.
#
#   $field - a comma separated field; may be undefined
#
# An undefined field, or one without any word character, gives "".
# If the field has two or more components, the result is the first
# character of each of the first two components. Otherwise it is the first
# character followed by the next consonant of the field, or, when there is
# no such consonant, simply the first two characters.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split (/,/, $field);

    if (scalar (@components) >= 2) {
        # only the first two components contribute
        my @initials = @components[0, 1];
        foreach my $part (@initials) {
            $part =~ s/^(.).*$/$1/;
        }
        return join ("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    if (!(defined $first && defined $second)) {
        # no consonant after the first character
        ($first, $second) = $field =~ /^(.)(.)/;
    }
    return "$first$second";
}
439
# Alters one part of a clashing directory name and returns the new name.
#
#   $namehash - reference to a hash with 'index', 'subcollection' and
#               'languages' entries, each mapping a processed name to the
#               original field it was made from
#   $index    - full index description "level:gran:subcollection:languages"
#   $indexref, $subref, $langref
#             - references to the processed index, subcollection and
#               language names; at most one of them is modified
#
# The first processed name that is already registered for a different
# original field is advanced with get_next_version. The return value is
# the concatenation of the three (possibly updated) processed names.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my $index_owner = $namehash->{'index'}->{$$indexref};
    if ($index_owner ne "$level:$gran") {
        $self->get_next_version ($indexref);
    }
    elsif ($namehash->{'subcollection'}->{$$subref} ne $subcollection) {
        $self->get_next_version ($subref);
    }
    elsif ($namehash->{'languages'}->{$$langref} ne $languages) {
        $self->get_next_version ($langref);
    }

    return join ("", $$indexref, $$subref, $$langref);
}
454
# Advances a processed name to its next version, in place.
#
#   $nameref - reference to the name to change
#
# A name ending in two digits has that number incremented, and a name
# ending in a single digit has the digit incremented (9 becomes 10).
# Any other name has its last character replaced by 0; an empty name
# becomes "0". Successive calls therefore give e.g.
# "tx" -> "t0" -> "t1" ... "t9" -> "t10" -> "t11".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d)$/) {
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d$/$num/;
    } elsif ($$nameref =~ /(\d)$/) {
        # Only ONE trailing digit is present in this branch, so exactly one
        # digit has to be replaced. Substituting two digits here would never
        # match for a name ending in 9, leaving the name unchanged so that
        # the caller would retry with the same name forever.
        my $num = $1 + 1;
        $$nameref =~ s/\d$/$num/;
    } elsif ($$nameref eq "") {
        # nothing to replace - make sure the name still changes
        $$nameref = "0";
    } else {
        $$nameref =~ s/.$/0/;
    }
}
470
# Builds a single index.
#
#   $index - index description "level:fields[:subcollection[:language]]";
#            its directory name is looked up in $self->{'index_mapping'}
#
# Steps:
#   1. pipe the documents into mg_passes -N1 to create the index dictionary
#   2. give up on this index (recording it in $self->{'notbuilt'}) if no
#      .id dictionary file was produced
#   3. run mg_perf_hash_build, then pipe the documents into mg_passes -N2
#      to invert the text
#   4. run mg_weights_build, mg_invf_dict and mg_stem_idx
#   5. delete every file in the index directory whose suffix is not a key
#      of %wanted_index_files
# In debug mode the documents are written to STDOUT and none of the mg
# programs are run.
#
# Dies if a required mg executable is missing, a pipe cannot be opened or
# the index directory cannot be read.
sub build_index {
    my ($self, $index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe = &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe = &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe = &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe = &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # maxnumeric from collect.cfg is only honoured when it is all digits
    my $maxnumeric = 4;
    my $cfg_maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    if (defined($cfg_maxnumeric) && $cfg_maxnumeric =~ /^\d+$/) {
        $maxnumeric = $cfg_maxnumeric;
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = ($index =~ /^paragraph/i) ? 3 : 2;

    # collect the index expressions for the subcollections and languages
    # this index belongs to
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    my @subcollections = ();
    if (defined $subcollection) {
        @subcollections = split (/,/, $subcollection);
    }
    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    my @languages = ();
    if (defined $language) {
        @languages = split (/,/, $language);
    }
    foreach my $wanted (@languages) {
        # a leading "!" negates the language
        my $not = 0;
        if ($wanted =~ s/^\!//) {
            $not = 1;
        }
        foreach my $lang (@{$self->{'collect_cfg'}->{'languages'}}) {
            next unless ($lang eq $wanted);
            if ($not) {
                push (@$indexexparr, "!Language/$wanted/");
            } else {
                push (@$indexexparr, "Language/$wanted/");
            }
            last;
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        my $dict_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra";
        unless (-e "$mg_passes_exe" && open (PIPEOUT, $dict_cmd)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        # the handle is passed around by its fully qualified name
        $handle = 'mgbuilder::PIPEOUT';
    }

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('text');
    $buildproc->set_index ($index, $indexexparr);
    $buildproc->set_indexing_text (1);
    $buildproc->set_store_text(1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we stop building this index so that the rest of the build can carry on.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        return;
    }

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        my $invert_cmd = "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
            "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra";
        unless (-e "$mg_passes_exe" && open ($handle, $invert_cmd)) {
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $buildproc->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (DIR, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir(DIR)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            next unless defined $suffix;
            next if defined $wanted_index_files{$suffix};

            # delete it!
            print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
        closedir (DIR);
    }
}
661
# Creates the collection's info database and processes associated files.
#
# Everything is written as text to a pipe into txt2db, which is given the
# database file name (text/<collection>.ldb on little endian machines,
# .bdb otherwise). In debug mode the text goes to STDOUT instead.
# The output consists of
#   - a [collection] entry built from the collectionmeta settings
#   - whatever the buildproc writes (in 'infodb' mode) while the plugins
#     read the documents
#   - the classification information
#   - a [browselist] entry listing all the documents
#
# Dies if txt2db is missing or the pipe to it cannot be opened.
sub make_infodatabase {
    my ($self) = @_;
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        unless (-e "$txt2db_exe" && open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        # the handle is passed around by its fully qualified name
        $handle = 'mgbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {
        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            my $defaultfound = 0;
            my $first = 1;
            my $metadata_entry = "";
            my $default = "";
            my $cmetamap = "";

            if ($cmeta =~ s/^\.//) {
                # names starting with "." refer to an index, subcollection
                # or language and are written under their mapped name
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }

            #iterate through the languages
            foreach my $lang (keys (%{$collectionmeta->{$cmeta}})) {
                if ($first) {
                    $first = 0;
                    #set the default default to the first entry
                    $default = $collectionmeta->{$cmeta}->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound = 1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $collectionmeta->{$cmeta}->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    # language specific entries have keys of the form [l=xx]
                    my ($l) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($l) {
                        $metadata_entry .= "<$cmetamap:$l>" .
                            $collectionmeta->{$cmeta}->{$lang} . "\n";
                    }
                }
            }

            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};
}
793
# Hook for collection specific processing; does nothing in this class.
# Returns the invocant.
sub collect_specific {
    my $self = shift;
    return $self;
}
797
# Writes the build configuration file <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the buildproc, the number of words and sections reported by
# mgstat (when it can be run), the index, subcollection and language maps
# taken from $self->{'index_mapping'}, the list of indexes that were not
# built, and maxnumeric.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    # The configuration is collected in a hash that is private to this
    # call, so optional entries (subcollectionmap, languagemap, notbuilt,
    # numwords, numsections) can never be carried over from an earlier
    # call in the same process.
    my $build_cfg = {};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                ($build_cfg->{'numwords'}) = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                ($build_cfg->{'numsections'}) = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that
    # are not built should not be put into the map.
    # NOTE(review): the keys tested here are "level:gran" names, while
    # 'notbuilt' is filled with full index descriptions elsewhere in this
    # file; the two only coincide for indexes without subcollection or
    # language fields - confirm that this is intended.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, $index . "->" .
                  $self->{'index_mapping'}->{'indexmap'}->{$index});
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # list the indexes that were not built, if there are any
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # maxnumeric from collect.cfg is only honoured when it is all digits
    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

}
877
# Counterpart of init; does nothing in this class.
# Returns the invocant.
sub deinit {
    my $self = shift;
    return $self;
}
881
# Prints statistics about the pass that has just finished to the output
# handle: the index being processed, the total number of bytes in the
# collection and the number of bytes processed for this index, all as
# reported by the buildproc. A warning is added when fewer than 50 bytes
# were processed, unless this was a text compression pass with no_text set.
sub print_stats {
    my ($self) = @_;

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        print $outhandle "***************\n";
        if ($indexing_text) {
            print $outhandle "WARNING: There is very little or no text to process for $index\n";
        } else {
            # only reached when text is being stored (no_text is off)
            print $outhandle "WARNING: There is very little or no text to compress\n";
        }
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
    }
}
910
9111;
Note: See TracBrowser for help on using the repository browser.