source: trunk/gsdl/perllib/mgbuilder.pm@ 10168

Last change on this file since 10168 was 10158, checked in by davidb, 19 years ago

*builder.pm packages (principally lucenebuilder.pl which inherits from
mgppbuilder) upgraded to support incremental building.

  • Property svn:keywords set to Author Date Id Revision
File size: 33.0 KB
Line 
1###########################################################################
2#
3# mgbuilder.pm -- MGBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33use FileHandle;
34
BEGIN {
    # Turn autoflush on for both standard streams so that output from mg
    # does not get out of sync with output from the plugins.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(1);
    }
}

END {
    # Restore normal buffering on the way out.
    foreach my $stream (\*STDOUT, \*STDERR) {
        $stream->autoflush(0);
    }
}
46
# Value passed to mg_passes with -b (maximum document size).
my $maxdocsize = 12000;

# Suffixes of the index files that are kept once an index has been built;
# build_index deletes files whose suffix is not listed here.
my %wanted_index_files = map { $_ => 1 }
    ('td', 't', 'idb', 'ib1', 'ib2', 'ib3', 'i', 'ip', 'tiw', 'wa');
59
60
# Construct an mgbuilder object.
#
# Positional arguments:
#   $class, $collection, $source_dir, $build_dir, $verbosity, $maxdocs,
#   $debug, $keepold, $remove_empty_classifications, $outhandle, $no_text,
#   $failhandle, $gli
#
# All of them are stored on the object under the same names.  $outhandle
# and $failhandle default to STDERR, $no_text and $gli default to 0.
#
# The constructor reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the index
# list by subcollection and language, loads the plugins and classifiers and
# instantiates the document processor (a collection specific
# "<collection>buildproc" when one exists, otherwise mgbuildproc).
#
# Dies when collect.cfg is missing, when no plugin could be loaded, or when
# the document processor cannot be loaded/constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $remove_empty_classifications,
        $outhandle, $no_text, $failhandle, $gli) = @_;

    # NOTE: these barewords are left as they are on purpose.  This file does
    # not enable strict, so the default is the plain string "STDERR", and
    # build_index compares the out handle against that string.
    $outhandle = STDERR unless defined $outhandle;
    $no_text = 0 unless defined $no_text;
    $failhandle = STDERR unless defined $failhandle;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'remove_empty_classifications'=>$remove_empty_classifications,
                      'outhandle'=>$outhandle,
                      'no_text'=>$no_text,
                      'failhandle'=>$failhandle,
                      'notbuilt'=>{},    # indexes not built
                      'gli'=>$gli
                      }, $class;

    $self->{'gli'} = 0 unless defined $self->{'gli'};

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);
    my $cfg = $self->{'collect_cfg'};

    if (!defined($cfg->{'indexes'})) {
        $cfg->{'indexes'} = [];
    }

    # sort out subcollection indexes: every index is built once per
    # subcollection, giving "index:subcollection"
    if (defined $cfg->{'indexsubcollections'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $subcollection (@{$cfg->{'indexsubcollections'}}) {
            foreach my $index (@$indexes) {
                push (@{$cfg->{'indexes'}}, "$index:$subcollection");
            }
        }
    }

    # sort out language subindexes
    if (defined $cfg->{'languages'}) {
        my $indexes = $cfg->{'indexes'};
        $cfg->{'indexes'} = [];
        foreach my $language (@{$cfg->{'languages'}}) {
            foreach my $index (@$indexes) {
                if (defined ($cfg->{'indexsubcollections'})) {
                    push (@{$cfg->{'indexes'}}, "$index:$language");
                }
                else { # add in an empty subcollection field
                    push (@{$cfg->{'indexes'}}, "$index\:\:$language");
                }
            }
        }
    }

    # make sure that the same index isn't specified more than once
    # ('indexes' is always defined by this point, first occurrence wins)
    my %seen = ();
    my @unique = ();
    foreach my $i (@{$cfg->{'indexes'}}) {
        next if defined ($seen{$i});
        $seen{$i} = 1;
        push (@unique, $i);
    }
    $cfg->{'indexes'} = \@unique;

    if (scalar(@{$cfg->{'indexes'}}) == 0) {
        # no indexes have been specified so we'll build a "dummy:text" index
        push (@{$cfg->{'indexes'}}, "dummy:text");
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $cfg->{'plugin'}) {
        $plugins = $cfg->{'plugin'};
    }

    # build up the extra global options for the plugins, then load them all
    my @global_opts = ();
    if (defined $cfg->{'separate_cjk'} && $cfg->{'separate_cjk'} =~ /^true$/i) {
        push @global_opts, "-separate_cjk";
    }
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins, $verbosity, $outhandle, $failhandle, \@global_opts);

    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print $outhandle "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $cfg->{'classify'}) {
        $classifiers = $cfg->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $build_dir, $outhandle);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($cfg->{'dontgdbm'})) {
        foreach my $dg (@{$cfg->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$dg} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }

    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor as a class method on the package name.  This
    # avoids evaluating a string built from the collection name and the
    # ambiguous "new CLASS(...)" syntax inside a package that defines its
    # own new().  Any exception raised by the constructor propagates.
    $self->{'buildproc'} = $buildproctype->new($collection, $source_dir, $build_dir,
                                               $keepold, $verbosity, $outhandle);

    return $self;
}
203
# Prepare the build directory.  Unless the builder runs in debug mode or
# was asked to keep the old build, the build directory is removed,
# recreated and given a fresh "text" subdirectory.
sub init {
    my $self = shift (@_);

    if (!$self->{'debug'} && !$self->{'keepold'}) {
        my $builddir = $self->{'build_dir'};

        # remove any old builds and start with an empty directory
        &util::rm_r($builddir);
        &util::mk_all_dir($builddir);

        # make the text directory
        &util::mk_all_dir("$builddir/text");
    }
}
217
# Create the compressed text for the collection.
#
# $textindex is handed to the document processor as the index to process.
#
# The documents are read three times through the plugins:
#   1. piped into "mg_passes -T1" to collect the text statistics,
#   2. (mg_compression_dict is run to build the compression dictionary)
#   3. piped into "mg_passes -T2" to compress the text.
# In debug mode nothing is executed; the processor output goes to STDOUT.
#
# Dies when one of the required executables is missing or a pipe cannot be
# opened.  When running under GLI, progress markers are written to STDERR.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $outhandle = $self->{'outhandle'};

    # -M argument for mg_passes; a numeric collect.cfg value overrides 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "text/$self->{'collection'}";
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, $basefilename);

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulltextprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
    }

    print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CompressText'>\n" if $self->{'gli'};

    # collect the statistics for the text
    # -b $maxdocsize sets the maximum document size to be 12 meg
    print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CollectTextStats'/>\n" if $self->{'gli'};

    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T1 -M $maxnumeric $osextra")) {
            # self-closing tag, like every other FatalError this module emits
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        # handle is referred to by name; the second pass re-opens it
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);

    if ($self->{'no_text'}) {
        $self->{'buildproc'}->set_store_text(0);
    } else {
        $self->{'buildproc'}->set_store_text(1);
    }
    $self->{'buildproc'}->reset();

    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});
    &plugin::end($self->{'pluginfo'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    if (!$self->{'debug'}) {
        print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingCompress'/>\n" if $self->{'gli'};
        if (!-e "$mg_compression_dict_exe") {
            die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("mg_compression_dict$exe -f \"$fulltextprefix\" -S -H -2 -k 5120 $osextra");

        # -b $maxdocsize sets the maximum document size to be 12 meg
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fulltextprefix\" -b $maxdocsize -T2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }
    else {
        print STDERR "<Phase name='SkipCreatingComp'/>\n" if $self->{'gli'};
    }

    $self->{'buildproc'}->reset();
    # compress the text
    print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CompressingText'/>\n" if $self->{'gli'};

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'}, 0, $self->{'gli'});

    close ($handle) unless $self->{'debug'};

    $self->print_stats();
    print STDERR "</Stage>\n" if $self->{'gli'};
}
325
# Decide whether $index should be built.
#
# Every entry of the collect.cfg "dontbuild" list is used as a regular
# expression anchored at both ends.  On the first match the index is
# recorded in the object's 'notbuilt' table and 0 is returned; when nothing
# matches (or there is no dontbuild list) the result is 1.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;
        $self->{'notbuilt'}->{$index} = 1;
        return 0;
    }

    return 1;
}
342
# Build the indexes of the collection.
#
# When $indexname contains at least one word character only that index is
# built, otherwise every index listed in the collection configuration.
# The index-to-directory mapping is (re)created first and stored on the
# object; indexes rejected by want_built are reported and skipped.
sub build_indexes {
    my $self = shift (@_);
    my ($indexname) = @_;
    my $outhandle = $self->{'outhandle'};

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        unless ($self->want_built($index)) {
            print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print $outhandle "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Stage name='Index' source='$index'>\n" if $self->{'gli'};
        $self->build_index($index);
    }
}
370
# Work out a directory name for every index description.
#
# $indexes is a reference to a list of descriptions of the form
# "level:granularity[:subcollection[:languages]]".  The result is a hash
# reference that maps each full description to its directory name and also
# carries the entries 'indexmap', 'subcollectionmap' and 'languagemap'
# together with the matching '...maporder' lists, which record the order in
# which the individual parts were first seen.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ('indexmaporder' => [],
                   'subcollectionmaporder' => [],
                   'languagemaporder' => []);

    # %taken is used to check for collisions; it starts off with the
    # mandatory directory names
    my %taken = ('text' => 'text', 'extra' => 'extra');
    # which source value each processed name currently stands for
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    foreach my $index (@$indexes) {
        my ($level, $gran, $subcollection, $languages) = split (":", $index);
        my $levelgran = "$level:$gran";

        # first character of the index level followed by a processed
        # version of the index
        my ($pindex) = $level =~ /^(.)/;
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # processed versions of the subcollection and of the language,
        # both empty when the description does not have that part
        my $psub = lc ($self->process_field ($subcollection));
        my $plang = lc ($self->process_field ($languages));

        # keep changing the parts until the directory name is unused
        my $dirname = $pindex . $psub . $plang;
        while (defined ($taken{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }
        $mapping{$index} = $dirname;

        # store the mapping orders as well as the maps, and put the index,
        # subcollection and language fields into the mapping too (the full
        # index name, eg document:text:subcol:lang, is not used on the
        # query page) - these are used for collectionmeta later on
        if (!defined $mapping{'indexmap'}{$levelgran}) {
            $mapping{'indexmap'}{$levelgran} = $pindex;
            push (@{$mapping{'indexmaporder'}}, $levelgran);
            $mapping{$levelgran} = $pindex unless defined $mapping{$levelgran};
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
            $mapping{$subcollection} = $psub;
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            push (@{$mapping{'languagemaporder'}}, $languages);
            $mapping{$languages} = $plang;
        }

        # remember what has been handed out
        $taken{$dirname} = $index;
        $pnames{'index'}->{$pindex} = $levelgran;
        $pnames{'subcollection'}->{$psub} = $subcollection;
        $pnames{'languages'}->{$plang} = $languages;
    }

    return \%mapping;
}
441
# Returns a processed (abbreviated) version of a field.
#
# - undefined fields and fields without a word character give "";
# - if the field has two or more comma separated components the result is
#   the first character of each of the first two components;
# - if the field has only one component the result is its first character
#   followed by the next consonant of that component, or, when there is no
#   later consonant, its first two characters;
# - a field that is just one character long gives that character (it used
#   to give "", which dropped the field from the directory name).
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # only the first two components take part
        splice (@components, 2);
        foreach my $component (@components) {
            $component =~ s/^(.).*$/$1/;
        }
        return join("", @components);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;

    # too short for either pattern: fall back to the first character alone
    ($first) = $field =~ /^(.)/ unless defined $first;

    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}
464
# Resolve a directory name collision for $index.
#
# $namehash records, per kind ('index', 'subcollection', 'languages'),
# which source value a processed name currently stands for.  $indexref,
# $subref and $langref refer to the processed parts of the name being
# built.  The first part (in that order) whose processed name stands for a
# different source value is advanced with get_next_version; at most one
# part changes per call.  Returns the three parts joined together.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    my @parts = (['index', $indexref, "$level:$gran"],
                 ['subcollection', $subref, $subcollection],
                 ['languages', $langref, $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        if ($namehash->{$kind}->{$$nameref} ne $wanted) {
            $self->get_next_version ($nameref);
            last;
        }
    }

    return join ("", $$indexref, $$subref, $$langref);
}
479
# Advance the name referred to by $nameref to its next version, in place.
#
# - a name ending in one or two digits has that number incremented
#   ("dt0" -> "dt1", "dt9" -> "dt10", "dt10" -> "dt11");
# - otherwise the last character is replaced by "0" ("dtx" -> "dt0");
# - a name with no replaceable last character (the empty string) gets "0"
#   appended, so that every call changes the name and the caller's
#   uniqueness loop cannot spin forever on an unchanged value.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d?)$/) {
        # $num is only ever used as a string, so ++ is Perl's string
        # increment: "9" becomes "10" and "09" becomes "10"
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d\d?$/$num/;
    } elsif ($$nameref !~ s/.$/0/) {
        $$nameref .= "0";
    }
}
494
# Build a single index.
#
# $index is an index description of the form
# "level:fields[:subcollections[:languages]]"; its directory is looked up
# in the object's index mapping.
#
# The documents are read twice through the plugins: first into
# "mg_passes -N1" to create the index dictionary, then (after
# mg_perf_hash_build) into "mg_passes -N2" to invert the text.  Afterwards
# the weights file, the on-disk stemmed dictionary and the stem indexes are
# created and every file with an unwanted suffix is removed from the index
# directory.  In debug mode the processor output goes to STDOUT and no
# external program is run.
#
# If the index dictionary (.id file) was not produced the index is recorded
# as not built and the sub returns early.  Dies when a required executable
# is missing or a pipe cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;
    my $outhandle = $self->{'outhandle'};

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    # -M argument for mg_passes; a numeric collect.cfg value overrides 4
    my $maxnumeric = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $maxnumeric = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fullindexprefix =~ s@/@\\@g;
    } else {
        $osextra = " -d /";
        if ($outhandle ne "STDERR") {
            # so mg_passes doesn't print to stderr if we redirect output
            $osextra .= " 2>/dev/null";
        }
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # there may be subcollection info, and language info.
    my ($level, $fields, $subcollection, $language) = split (":", $index);

    # get the index expressions if this index belongs to subcollections
    my $indexexparr = [];
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);
    foreach my $subcollection (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcollection})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcollection});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection - only put languages expressions for the
    # ones we want in the index.  Each entry is passed on exactly as it was
    # written, including a leading "!".
    my $language_metadata = "Language";
    if (defined ($self->{'collect_cfg'}->{'language_metadata'})) {
        $language_metadata = $self->{'collect_cfg'}->{'language_metadata'};
    }
    my $langarr = [];
    @$langarr = split /,/, $language if (defined $language);

    # Build index dictionary. Uses verbatim stem method
    print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='CreatingIndexDic'/>\n" if $self->{'gli'};
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -m 32 -s 0 -G -t 10 -N1 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        # handle is referred to by name; the second pass re-opens it
        $handle = mgbuilder::PIPEOUT;
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    $self->{'buildproc'}->set_indexing_text (1);
    $self->{'buildproc'}->set_store_text(1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});
    close ($handle) unless $self->{'debug'};

    $self->print_stats();

    # now we check to see if the required files have been produced - if not
    # we stop building this index so that the rest of the build can carry on.
    # we check on the .id file - index dictionary
    my $dict_file = "$fullindexprefix.id";
    if (!-e $dict_file) {
        print $outhandle "mgbuilder::build_index - Couldn't create index $index\n";
        $self->{'notbuilt'}->{$index}=1;
        # close the stage here as well, as every other exit of this sub does
        print STDERR "</Stage>\n" if $self->{'gli'};
        return;
    }
    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            print STDERR "<FatalError name='NoRunMGHash'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("mg_perf_hash_build$exe -f \"$fullindexprefix\" $osextra");

        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| mg_passes$exe -f \"$fullindexprefix\" -b $maxdocsize " .
                   "-$index_level -c 3 -G -t 10 -N2 -M $maxnumeric $osextra")) {
            print STDERR "<FatalError name='NoRunMGPasses'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Phase name='InvertingText'/>\n" if $self->{'gli'};
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    $self->print_stats ();

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateTheWeights'/>\n" if $self->{'gli'};
        if (!-e "$mg_weights_build_exe") {
            print STDERR "<FatalError name='NoRunMGWeights'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("mg_weights_build$exe -f \"$fullindexprefix\" -t \"$fulltextprefix\" $osextra");

        # create 'on-disk' stemmed dictionary
        print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreateStemmedDic'/>\n" if $self->{'gli'};
        if (!-e "$mg_invf_dict_exe") {
            print STDERR "<FatalError name='NoRunMGInvf'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("mg_invf_dict$exe -f \"$fullindexprefix\" $osextra");

        # creates stem index files for the various stemming methods
        print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        print STDERR "<Phase name='CreatingStemIndx'/>\n" if $self->{'gli'};
        if (!-e "$mg_stem_idx_exe") {
            print STDERR "<FatalError name='NoRunMGStem'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("mg_stem_idx$exe -b 4096 -s1 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s2 -f \"$fullindexprefix\" $osextra");
        system ("mg_stem_idx$exe -b 4096 -s3 -f \"$fullindexprefix\" $osextra");

        # remove unwanted files: anything with a suffix that is not listed
        # in %wanted_index_files (dot files and suffix-less files are kept)
        my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
        opendir (my $dirhandle, $tmpdir) || die
            "mgbuilder::build_index - couldn't read directory $tmpdir\n";
        foreach my $file (readdir($dirhandle)) {
            next if $file =~ /^\./;
            my ($suffix) = $file =~ /\.([^\.]+)$/;
            if (defined $suffix && !defined $wanted_index_files{$suffix}) {
                # delete it!
                print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2;
                &util::rm (&util::filename_cat ($tmpdir, $file));
            }
        }
        closedir ($dirhandle);
    }
    print STDERR "</Stage>\n" if $self->{'gli'};
}
697
# Create the collection information database and process associated files.
#
# Everything is written as text to a pipe into txt2db (or to STDOUT in
# debug mode): first a "[collection]" record holding the collectionmeta
# values, then whatever the document processor emits in 'infodb' mode while
# the documents are read, then the classifier information and finally a
# "[browselist]" record listing all documents.
#
# Collectionmeta names and their language variants are processed in sorted
# order, so the output, and the value used as the default when no
# 'default' entry exists, does not depend on hash ordering.
#
# Dies when txt2db is missing or the pipe cannot be opened.
sub make_infodatabase {
    my $self = shift (@_);
    my $outhandle = $self->{'outhandle'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name
    my $dbext = ".bdb";
    $dbext = ".ldb" if &util::is_little_endian();
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    $fulldbname =~ s/\//\\/g if ($ENV{'GSDLOS'} =~ /^windows$/i);

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    print $outhandle "\n*** creating the info database and processing associated files\n"
        if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreateInfoData'>\n" if $self->{'gli'};

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor
    my ($handle);
    if ($self->{'debug'}) {
        $handle = STDOUT;
    } else {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| txt2db$exe \"$fulldbname\"")) {
            print STDERR "<FatalError name='NoRunText2DB'/>\n</Stage>\n" if $self->{'gli'};
            die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = mgbuilder::PIPEOUT;
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('infodb');
    $self->{'buildproc'}->set_assocdir ($assocdir);
    $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'});
    $self->{'buildproc'}->set_classifiers ($self->{'classifiers'});
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->set_store_text(1);
    $self->{'buildproc'}->reset();

    if (defined $self->{'collect_cfg'}->{'collectionmeta'}) {

        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
        foreach my $cmetakey (sort keys (%$collectionmeta)) {
            my $cmeta = $cmetakey;
            my $defaultfound=0;
            my $first=1;
            my $metadata_entry = "";
            my $default="";
            my $cmetamap = "";
            # names starting with "." refer to an index, subcollection or
            # language and are written under their mapped name
            if ($cmeta =~ s/^\.//) {
                if (defined $self->{'index_mapping'}->{$cmeta}) {
                    $cmetamap = $self->{'index_mapping'}->{$cmeta};
                    $cmeta = ".$cmeta";
                }
                else {
                    print $outhandle "mgbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";
                    next; #ignore this one
                }
            }
            else {
                $cmetamap = $cmeta; # just using the same name
            }

            #iterate through the languages
            my $values = $collectionmeta->{$cmeta};
            foreach my $lang (sort keys (%$values)) {
                if ($first) {
                    $first=0;
                    #set the default default to the first entry
                    $default=$values->{$lang};
                }
                if ($lang =~ /default/) {
                    $defaultfound=1;
                    #the default entry goes first
                    $metadata_entry = "<$cmetamap>" .
                        $values->{'default'} . "\n" . $metadata_entry;
                }
                else {
                    my ($l) = $lang =~ /^\[l=(\w*)\]$/;
                    if ($l) {
                        $metadata_entry .= "<$cmetamap:$l>" .
                            $values->{$lang} . "\n";

                        # Use the English value as the default if no default is specified
                        if ($l =~ /en/i) {
                            $default=$values->{$lang};
                        }
                    }
                }
            }
            #if we haven't found a default, put one in
            if (!$defaultfound) {
                $metadata_entry = "<$cmetamap>$default\n" . $metadata_entry;
            }
            #write the entry to the file
            print $handle $metadata_entry;

        }

        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'},0, $self->{'gli'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'remove_empty_classifications'},
                                     $self->{'gli'});

    #output doclist
    my @doclist = $self->{'buildproc'}->get_doc_list();
    my $docs = join (";",@doclist);
    print $handle "[browselist]\n";
    print $handle "<hastxt>0\n";
    print $handle "<childtype>VList\n";
    print $handle "<numleafdocs>" . ($#doclist+1) . "\n";
    print $handle "<thistype>Invisible\n";
    print $handle "<contains>$docs";
    print $handle "\n" . ('-' x 70) . "\n";

    close ($handle) if !$self->{'debug'};

    print STDERR "</Stage>\n" if $self->{'gli'};
}
835
# Deliberately empty: nothing is done here beyond handing the object back.
sub collect_specific {
    my $self = $_[0];
    return $self;
}
839
# Write build.cfg into the build directory.
#
# The file records the build date, the index stem (the collection name),
# document and byte counts taken from the document processor, word and
# section counts parsed from the output of mgstat (when it can be run),
# the index / subcollection / language maps, the list of indexes that were
# not built and the maxnumeric setting.
#
# A missing or failing mgstat only produces a warning.
sub make_auxiliary_files {
    my $self = shift (@_);
    my $build_cfg = {};
    my $outhandle = $self->{'outhandle'};

    print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);
    print STDERR "<Stage name='CreatingAuxilary'>\n" if $self->{'gli'};

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;
    $build_cfg->{'indexstem'} = $self->{'collection'};
    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # get additional stats from mg
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $mgstat_exe = &util::filename_cat($exedir, "mgstat$exe");
    my $input_file = &util::filename_cat ("text", $self->{'collection'});
    if (!-e "$mgstat_exe" || !open (PIPEIN, "mgstat$exe -d \"$self->{'build_dir'}\" -f \"$input_file\" |")) {
        print $outhandle "Warning: Couldn't open pipe to $mgstat_exe to get additional stats\n";
    } else {
        my $line = "";
        while (defined ($line = <PIPEIN>)) {
            if ($line =~ /^Words in collection \[dict\]\s+:\s+(\d+)/) {
                $build_cfg->{'numwords'} = $1;
            } elsif ($line =~ /^Documents\s+:\s+(\d+)/) {
                $build_cfg->{'numsections'} = $1;
            }
        }
        close PIPEIN;
    }

    # store the mapping between the index names and the directory names
    # the index map is used to determine what indexes there are, so any that
    # are not built should not be put into the map.
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        if (not defined ($self->{'notbuilt'}->{$index})) {
            push (@indexmap, "$index\-\>$self->{'index_mapping'}->{'indexmap'}->{$index}");
        }
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, "$subcollection\-\>" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, "$language\-\>" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    # sorted, so that the list is written in the same order on every run
    my @notbuilt = sort keys %{$self->{'notbuilt'}};
    $build_cfg->{'notbuilt'} = \@notbuilt if scalar (@notbuilt);

    $build_cfg->{'maxnumeric'} = 4;
    if (defined($self->{'collect_cfg'}->{'maxnumeric'}) &&
        $self->{'collect_cfg'}->{'maxnumeric'} =~ /^\d+$/) {
        $build_cfg->{'maxnumeric'} = $self->{'collect_cfg'}->{'maxnumeric'};
    }

    # write out the build information; the first pattern lists the
    # single-valued entries, the second the list-valued ones
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes|numwords|numsections|maxnumeric|indexstem)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');

    print STDERR "</Stage>\n" if $self->{'gli'};
}
921
# Hand the loaded plugins and the document processor to plugin::deinit and
# pass its result back to the caller.
sub deinit {
    my ($self) = @_;

    return &plugin::deinit($self->{'pluginfo'}, $self->{'buildproc'});
}
927
# Report the byte counts of the pass that has just finished on the out
# handle.  When fewer than 50 bytes were processed (and text was actually
# expected) a warning is printed as well, plus a GLI marker on STDERR when
# running under GLI.
sub print_stats {
    my $self = shift (@_);

    my $outhandle = $self->{'outhandle'};
    my $buildproc = $self->{'buildproc'};
    my $indexing_text = $buildproc->get_indexing_text();
    my $index = $buildproc->get_index();
    my $num_bytes = $buildproc->get_num_bytes();
    my $num_processed_bytes = $buildproc->get_num_processed_bytes();

    my $heading = $indexing_text
        ? "Stats (Creating index $index)\n"
        : "Stats (Compressing text from $index)\n";
    print $outhandle $heading;
    print $outhandle "Total bytes in collection: $num_bytes\n";
    print $outhandle "Total bytes in $index: $num_processed_bytes\n";

    if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) {
        # not indexing here means text is being compressed and no_text is
        # off, otherwise the condition above would not have held
        my $warning = $indexing_text
            ? "WARNING: There is very little or no text to process for $index\n"
            : "WARNING: There is very little or no text to compress\n";

        print $outhandle "***************\n";
        print $outhandle $warning;
        print $outhandle " Was this your intention?\n";
        print $outhandle "***************\n";
        print STDERR "<Warning name='LittleOrNoText'>\n" if $self->{'gli'};
    }
}
957
9581;
959
960
961
Note: See TracBrowser for help on using the repository browser.