source: main/tags/2.53/gsdl/perllib/mgbuilder.pm@ 24239

Last change on this file since 24239 was 8776, checked in by kjdon, 19 years ago

fixed a bug whereby you couldn't build more than 11 subcollections

  • Property svn:keywords set to Author Date Id Revision
File size: 32.7 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
# Unbuffer STDOUT and STDERR for as long as the builder is loaded so that
# the output of mg does not get out of sync with the output of the plugins.
BEGIN {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

# Switch autoflush off again when the interpreter shuts down.
END {
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Maximum document size, passed to mg_passes through its -b option.
my $maxdocsize = 12000;

# Suffixes of the files that are kept in an index directory; build_index
# deletes every other suffixed file once an index has been built.
my %wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
59
60
# Constructor.
#
# Reads the collection's collect.cfg, expands the configured indexes by
# subcollection and by language, loads the plugins and the classifiers and
# creates the document processor used for building.
#
# Positional arguments: $collection, $source_dir, $build_dir, $verbosity,
# $maxdocs, $debug, $keepold, $remove_empty_classifications, $outhandle,
# $no_text, $failhandle, $gli.  $outhandle and $failhandle default to
# STDERR; $no_text and $gli default to 0.
#
# Returns the new mgbuilder object.  Dies if collect.cfg does not exist, if
# no plugins were loaded, or if the build processor cannot be loaded or
# constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # The defaults are kept as barewords: build_index compares the output
    # handle with the string "STDERR".
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},    # indexes not built
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    if (!defined($self->{'collect_cfg'}->{'indexes'})) {
        $self->{'collect_cfg'}->{'indexes'} = [];
    }
    my $cfg = $self->{'collect_cfg'};

    # sort out subcollection indexes: every index is repeated once for each
    # configured subcollection
    if (defined $cfg->{'indexsubcollections'}) {
        my @base_indexes = @{$cfg->{'indexes'}};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            push (@{$cfg->{'indexes'}},
                  map { "$_:$subcollection" } @base_indexes);
        }
    }

    # sort out language subindexes: every index is repeated once for each
    # configured language
    if (defined $cfg->{'languages'}) {
        my @base_indexes = @{$cfg->{'indexes'}};
        # without subcollections an empty subcollection field is added
        my $separator = defined ($cfg->{'indexsubcollections'}) ? ":" : "::";
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            push (@{$cfg->{'indexes'}},
                  map { "$_$separator$language" } @base_indexes);
        }
    }

    # make sure that the same index isn't specified more than once
    # ('indexes' is always defined at this point, so no fallback is needed)
    my %seen = ();
    $cfg->{'indexes'} = [ grep { !$seen{$_}++ } @{$cfg->{'indexes'}} ];

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = defined $cfg->{'plugin'} ? $cfg->{'plugin'} : [];

    # build up the extra global options for the plugins
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = defined $cfg->{'classify'} ? $cfg->{'classify'} : [];

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }

    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the computed class name
    # instead of compiling a code string that embeds the collection name.
    $self->{'buildproc'} = eval {
        $buildproctype->new($collection, $source_dir, $build_dir,
                            $verbosity, $outhandle);
    };
    die "$@" if $@;

    return $self;
}
203
# Prepares the build directory.
# Unless the builder runs in debug mode or was asked to keep the old build,
# the existing build directory is removed and recreated together with its
# "text" subdirectory.
sub init {
    my $self = shift (@_);

    # debug and keepold both leave an existing build untouched
    return if $self->{'debug'} || $self->{'keepold'};

    my $build_dir = $self->{'build_dir'};

    # remove any old builds
    &util::rm_r($build_dir);
    &util::mk_all_dir($build_dir);

    # make the text directory
    &util::mk_all_dir("$build_dir/text");
}
217
# Compresses the text of the collection with mg.
#
# $textindex is the index description handed to the build processor through
# set_index.  The documents are read twice: a first mg_passes run (-T1)
# collects the text statistics, mg_compression_dict then creates the
# compression dictionary, and a second mg_passes run (-T2) compresses the
# text.  In debug mode no mg program is started and the documents are
# written to STDOUT instead.
#
# Dies if one of the mg programs is missing or cannot be started.  When
# running under GLI, stage, phase and error tags are written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # use the configured maxnumeric value only when it is a plain number
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # the error tag is self-closing, like every other FatalError
            # tag, so that the following </Stage> closes the stage
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    # the text itself is only stored when no_text was not requested
    $self->{'buildproc'}->set_store_text($self->{'no_text'} ? 0 : 1);
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
325
# Decides whether an index should be built.
# Returns 0, and records the index in $self->{'notbuilt'}, when the index
# description matches (as an anchored regular expression) one of the
# "dontbuild" entries of the collection configuration; returns 1 otherwise.
sub want_built {
    my $self = shift (@_);
    my ($index) = @_;

    my $dontbuild = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($dontbuild);

    foreach my $pattern (@$dontbuild) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
342
# Builds either the single index named by $indexname or, when no usable
# name is given, every index listed in the collection configuration.
# The mapping between index descriptions and directory names is created
# first; indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }
}
370
# Creates a directory name for each of the index descriptions in @$indexes
# ("level:granularity[:subcollection[:languages]]").
# Returns a reference to a hash that maps every full description to its
# directory name and additionally holds the 'indexmap', 'subcollectionmap'
# and 'languagemap' tables together with the matching '...maporder' lists
# that record the order in which the entries were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ('indexmaporder'         => [],
                   'subcollectionmaporder' => [],
                   'languagemaporder'      => []);

    # %taken is used to check for collisions; it starts off with the
    # mandatory directory names
    my %taken = ('text'=>'text',
                 'extra'=>'extra');
    # processed name => original field, for each part of a directory name
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);
        my $levelgran = "$level:$gran";

        # the directory name starts with the first character of the index
        # level, followed by a processed version of the index
        my ($pindex) = $level =~ /^(.)/;
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # then processed versions of the subcollection and of the
        # language, where there are any
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep changing the name until it is unique
        my $dirname = $pindex . $psub . $plang;
        while (defined ($taken{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps
        # also put index, subcollection and language fields into the mapping -
        # (the full index name (eg document:text:subcol:lang) is not used on
        # the query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{$levelgran}) {
            $mapping{'indexmap'}{$levelgran} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $levelgran);
            $mapping{$levelgran} = $pindex unless defined $mapping{$levelgran};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember which names are in use and what each part stands for
        $taken{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $levelgran;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
441
# Returns the short, processed version of a field.
# A field with several comma separated components yields the first
# character of each of its first two components.  A field with a single
# component yields its first character followed by the next consonant;
# when there is no such consonant its first two characters are used.
# An undefined field, or one without any word character, yields "".
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;

    if (scalar @components < 2) {
        my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
        ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
        return "$first$second";
    }

    # only the first two components contribute to the result
    my @initials = ();
    foreach my $component (@components[0, 1]) {
        (my $initial = $component) =~ s/^(.).*$/$1/;
        push (@initials, $initial);
    }
    return join("", @initials);
}
464
# Changes one part of a colliding directory name and returns the new name.
# $namehash records, for the 'index', 'subcollection' and 'languages'
# parts, which original field each processed name currently stands for.
# The first part (in that order) whose processed name stands for a
# different field than the one in $index is advanced to its next version
# through the matching scalar reference; at most one part is changed.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # [part of the name, reference to its processed form, original field]
    my @parts = (['index',         $indexref, "$level:$gran"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $field) = @$part;
        next if $namehash->{$kind}->{$$nameref} eq $field;
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
479
# Advances the name behind $nameref to its next version, in place.
# A name ending in one or two digits has that number incremented (so "9"
# becomes "10"); any other name has its last character replaced by "0".
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d?)$/) {
        # up to two trailing digits form the current version number
        my $next = $1 + 1;
        $$nameref =~ s/\d\d?$/$next/;
        return;
    }

    # no version yet: the last character makes room for version 0
    $$nameref =~ s/.$/0/;
}
494
# Builds one index with mg.
#
# $index is the index description ("level:fields[:subcollection[:language]]");
# its directory is looked up in $self->{'index_mapping'}.  The documents
# are passed through mg_passes twice (-N1 creates the index dictionary, -N2
# inverts the text) and the weights file, the on-disk stemmed dictionary
# and the stem indexes are created afterwards.  Files whose suffix is not
# listed in %wanted_index_files are then deleted from the index directory.
# In debug mode no mg program is started and the documents are written to
# STDOUT instead.
#
# If the index dictionary (.id file) was not produced, the index is
# recorded in $self->{'notbuilt'} and the method returns early.  Dies if
# one of the mg programs is missing or cannot be started.  When running
# under GLI the stage is closed on STDERR on every exit path.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # use the configured maxnumeric value only when it is a plain number
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index
    # this puts a separate Language/en entry in for each language in the list
    # is this what we want?
    # should we just have one entry with Language/en,es/ ??
    my @languages = ();
    @languages = split /,/, $language if (defined $language);
    foreach my $language (@languages) {
        # a leading "!" negates the language expression
        my $not = ($language =~ s/^\!//) ? "!" : "";
        push (@$indexexparr, "${not}Language/$language/");
    }

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we quit building this index so the whole build doesn't fail.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        # close the stage here as well, as every other exit path does
        print STDERR "</Stage>\n" if $self->{'gli'};
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        # one run for each of the stemming methods -s1, -s2 and -s3
        foreach my $stem_method (1 .. 3) {
            system ("mg_stem_idx$exe -b 4096 -s$stem_method -f \"$fullindexprefix\" $osextra");
        }

        # remove unwanted files
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
695
# Creates the info database of the collection and processes the associated
# files.
# The collection level metadata, the document information produced by the
# plugins, the classification information and the list of documents are
# written to txt2db (or to STDOUT in debug mode).
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order of the machine
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->set_store_text(1);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        # names starting with "." are looked up in the index mapping
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $cmeta (keys (%$collectionmeta)) {
            my $values = $collectionmeta->{$cmeta};

            # plain names are written out under the same name
            my $cmetamap = $cmeta;
            if ($cmeta =~ /^\./) {
                my $mapped_name = substr ($cmeta, 1);
                if (!defined $self->{'index_mapping'}->{$mapped_name}) {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$mapped_name' - ignored\n";
                    next; #ignore this one
                }
                $cmetamap = $self->{'index_mapping'}->{$mapped_name};
            }

            my $metadata_entry = "";
            my $default = "";
            my $defaultfound = 0;
            my $first = 1;

            #iterate through the languages
            foreach my $lang (keys (%$values)) {
                if ($first) {
                    #set the default default to the first entry
                    $first = 0;
                    $default = $values->{$lang};
                }

                if ($lang =~ /default/) {
                    #the default entry goes first
                    $defaultfound = 1;
                    $metadata_entry = "<$cmetamap>" .
                        $values->{'default'} . "\n" . $metadata_entry;
                    next;
                }

                # language specific entries have keys of the form [l=xx]
                my ($langcode) = $lang =~ /^\[l=(\w*)\]$/;
                next unless $langcode;

                $metadata_entry .= "<$cmetamap:$langcode>" .
                    $values->{$lang} . "\n";

                # Use the English value as the default if no default is specified
                $default = $values->{$lang} if $langcode =~ /en/i;
            }

            #if we haven't found a default, put one in
            $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry
                unless $defaultfound;

            #write the entry to the file
            print $handle $metadata_entry;
        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $buildproc->get_doc_list();
    my $docs = join (";", @doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . scalar(@doclist) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
833
# Hook for collection specific processing; this implementation does nothing
# and hands back the builder object.
sub collect_specific {
    my $self = $_[0];
    return $self;
}
837
# Writes build.cfg into the build directory.
# The file records the build date, the document and byte counts reported
# by the build processor, the word and section counts reported by mgstat
# (when it can be run), the index, subcollection and language maps, the
# indexes that were not built and the maxnumeric setting.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (-e "$mgstat_exe" && open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        while (defined (my $line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close PIPEIN;
    } else {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any
    # that are not built should not be put into the map.
    # NOTE(review): want_built and build_index key 'notbuilt' by the full
    # index description while 'indexmaporder' holds "level:gran" entries -
    # confirm that the two agree when subcollections or languages are used.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        next if defined ($self->{'notbuilt'}->{$index});
        push (@indexmap, $index . "->" . $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    # the subcollection and language maps are only stored when non-empty
    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # list the indexes that were not built, if there are any
    my @notbuilt = keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    # use the configured maxnumeric value only when it is a plain number
    my $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    $build_cfg->{'maxnumeric'} =
        (defined($maxnumeric) && $maxnumeric =~ /^\d+$/) ? $maxnumeric : 4;

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
919
# Counterpart of init; this implementation has nothing to clean up and
# hands back the builder object.
sub deinit {
    my $self = $_[0];
    return $self;
}
923
# Prints the statistics of the last pass over the documents to the output
# handle: the number of bytes in the collection and the number of bytes
# processed for the current index.
# A warning is added when fewer than 50 bytes were processed, unless the
# pass was compressing text and no_text is set; under GLI a Warning tag is
# then written to STDERR as well.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $activity = $indexing_text ? "Creating index $index"
                                  : "Compressing text from $index";
    print $outhandle "Stats ($activity)\n";
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    # only warn when text was expected but hardly any was seen
    return unless $num_processed_bytes < 50 &&
        ($indexing_text || !$self->{'no_text'});

    my $warning = $indexing_text
        ? "WARNING: There is very little or no text to process for $index\n"
        : "WARNING: There is very little or no text to compress\n";

    print $outhandle "***************\n";
    print $outhandle $warning;
    print $outhandle " Was this your intention?\n";
    print $outhandle "***************\n";
    print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
}
953
9541;
955
956
957
Note: See TracBrowser for help on using the repository browser.