source: trunk/gsdl/perllib/mgppbuilder.pm@ 932

Last change on this file since 932 was 932, checked in by kjm18, 24 years ago

new building programs for mgpp added

  • Property svn:keywords set to Author Date Id Revision
File size: 27.9 KB
Line 
1###########################################################################
2#
3# mgppbuilder.pm -- MGPPBuilder object
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package mgppbuilder;
27
28use classify;
29use cfgread;
30use colcfg;
31use plugin;
32use util;
33
# Suffixes of the index files that are worth keeping after a build; each
# listed suffix maps to a true value.  Within this file the table is only
# referenced by the disabled clean-up code at the end of build_index.
# TODO: update this list for mgpp.
%wanted_index_files = map { $_ => 1 }
    qw(td t idb ib1 ib2 ib3 i ip tiw wa);
45
46
# Creates an mgppbuilder object for one collection.
#
# Arguments (after the class name): $collection, $source_dir, $build_dir,
# $verbosity, $maxdocs, $debug, $keepold, $allclassifications.  They are
# stored unchanged in the new object.
#
# The constructor reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the
# configured indexes by subcollection and by language, loads the plugins
# and classifiers named in the configuration, records the 'dontgdbm'
# fields and creates the document processor ('buildproc').
#
# Dies if collect.cfg does not exist, if no plugin was loaded, or if the
# document processor module cannot be loaded or constructed.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity,
        $maxdocs, $debug, $keepold, $allclassifications) = @_;

    # create an mgppbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'maxdocs'=>$maxdocs,
                      'debug'=>$debug,
                      'keepold'=>$keepold,
                      'allclassifications'=>$allclassifications,
                      'notbuilt'=>[]    # indexes not built
                      }, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgppbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes first, then language subindexes:
    # each configured list multiplies the current index list, producing
    # "index:suffix" entries in the order suffix-major, index-minor
    foreach my $listname ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$listname};

        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        $self->{'collect_cfg'}->{'indexes'} = [];
        foreach my $suffix (@{$self->{'collect_cfg'}->{$listname}}) {
            foreach my $index (@$indexes) {
                push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$suffix");
            }
        }
    }

    # get the list of plugins for this collection
    my $plugins = [];
    if (defined $self->{'collect_cfg'}->{'plugin'}) {
        $plugins = $self->{'collect_cfg'}->{'plugin'};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # get the list of classifiers for this collection
    my $classifiers = [];
    if (defined $self->{'collect_cfg'}->{'classify'}) {
        $classifiers = $self->{'collect_cfg'}->{'classify'};
    }

    # load all the classifiers
    $self->{'classifiers'} = &classify::load_classifiers ($classifiers);

    # load up any dontgdbm fields
    $self->{'dontgdbm'} = {};
    if (defined ($self->{'collect_cfg'}->{'dontgdbm'})) {
        foreach my $field (@{$self->{'collect_cfg'}->{'dontgdbm'}}) {
            $self->{'dontgdbm'}->{$field} = 1;
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mgpp buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgppbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # Call the constructor through the class name held in $buildproctype.
    # The former string eval of "new $buildproctype(...)" relied on
    # indirect-object parsing, which is ambiguous here because this
    # package defines its own sub new.  Any die raised by the constructor
    # still propagates to the caller.
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity);

    return $self;
}
143
# Prepares the build directory for a fresh build.
# When the builder is neither in debug mode nor asked to keep the old
# build, the build directory is removed and recreated, together with its
# "text" subdirectory.  Otherwise nothing is touched.
sub init {
    my $self = shift (@_);

    if (!($self->{'debug'} || $self->{'keepold'})) {
        # start from an empty build directory
        &util::rm_r ($self->{'build_dir'});
        &util::mk_all_dir ($self->{'build_dir'});

        # the compressed text lives in its own subdirectory
        &util::mk_all_dir ("$self->{'build_dir'}/text");
    }
}
157
# Builds the compressed text and one index for the collection in a single
# run, writing all files directly into the build directory.
#
# Arguments: $textindex (only echoed in the diagnostic line) and
# $indexname, the index description to build.
#
# Steps: a first mg_passes run (-T1 -I1), then the compression dictionary
# and perfect hash function, a second mg_passes run (-T2 -I2), and finally
# the weights file, the stemmed dictionary and the three stem indexes.
# In debug mode the documents are written to STDOUT instead of being
# piped into mg_passes and none of the mgpp tools are run.
#
# Dies if one of the required executables does not exist or if a pipe to
# mg_passes cannot be opened.
sub build_collection {
    my $self = shift (@_);
    my ($textindex, $indexname) = @_;

    print STDERR "build_col, textindex=$textindex, indexname=$indexname\n";
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();

    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));
    my $basefilename = "$self->{'collection'}";

    # Every tool is pointed at the build directory itself.  The directory
    # is converted to backslashes on Windows for all tools alike, as
    # build_index does; previously only the text prefix was converted.
    my $builddir = $self->{'build_dir'};
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir =~ s/\//\\/g;
    }

    # indexname comes from the command line.  If it was not specified it
    # is "", so fall back to the indexes stated in the cfg file.
    # NOTE(review): in that case the index actually passed to the document
    # processor is the hard-coded list below, and $indexes is only used
    # for the diagnostic line - confirm this is still intended.
    my $indexes = [];
    if (!(defined $indexname && $indexname =~ /\w/)) {
        $indexes = $self->{'collect_cfg'}->{'indexes'};
        $indexname="Title,Organization,Magazine,text";
    }
    else {
        push @$indexes, $indexname;
    }
    print STDERR "indexes are: @$indexes\n";

    print STDERR "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);
    print STDERR "fulltextprefix=$builddir\n";

    # carry out the first pass of mg_passes
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # The document processor is handed the *name* of the output handle
    # (a symbolic file handle), exactly as the bareword form did.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -T1 -I1 -d $builddir -f $basefilename")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # Assume that only one index is going to be built for now, so the
    # index will be anything specified in the cfg file
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($indexname);
    $self->{'buildproc'}->set_indexing_text (1); # not used at the moment I think
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # finish the first pass (the pipe used to be closed twice here)
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the compression dictionary
        # NOTE(review): an older comment here described the options
        # -S -H -2 -k 5120; none of them are passed below - confirm.
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename");

        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_collection - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        # Second pass: compress the text and invert it.  Reopening the
        # same named handle keeps the document processor's handle valid.
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -T2 -I2")) {
            die "mgppbuilder::build_collection - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    # Nothing was fed to mg_passes in debug mode, so there is no index
    # data for the remaining tools to work on (build_index skips them in
    # debug mode as well).
    return if $self->{'debug'};

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_weights_build_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_weights_build_exe\n";
    }
    system ("$mg_weights_build_exe -d $builddir -f $basefilename ");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_invf_dict_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_invf_dict_exe\n";
    }
    system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$mg_stem_idx_exe") {
        die "mgppbuilder::build_collection - couldn't run $mg_stem_idx_exe\n";
    }
    system ("$mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");
    system ("$mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");
    system ("$mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");
}
304
305
# Creates the compressed text of the collection (for mgpp collections
# with more than one index, where the text is built separately from the
# indexes).
#
# Argument: $textindex, passed on to the document processor's set_index.
#
# Runs mg_passes with -T1 to collect the text statistics, builds the
# compression dictionary and then runs mg_passes with -T2 to compress the
# text.  The files are written below the build directory using the base
# name "text/<collection>".  In debug mode the documents are written to
# STDOUT and no mgpp tool is run.
#
# Dies if a required executable does not exist or if a pipe to mg_passes
# cannot be opened.
sub compress_text {
    my $self = shift (@_);
    my ($textindex) = @_;

    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";
    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");

    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text"));

    my $builddir = $self->{'build_dir'};
    my $basefilename = "text/$self->{'collection'}";

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $basefilename =~ s/\//\\/g;
        $builddir =~ s/\//\\/g;
    }

    print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);

    # collect the statistics for the text
    print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);

    # The document processor is handed the *name* of the output handle
    # (a symbolic file handle), exactly as the bareword form did.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -T1")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($textindex);
    $self->{'buildproc'}->set_indexing_text (0);
    $self->{'buildproc'}->reset();
    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
                   $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    &plugin::end($self->{'pluginfo'});

    # finish the first pass (the pipe used to be closed twice here)
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the compression dictionary
        # the compression dictionary is built by assuming the stats are
        # from a seed dictionary (-S), if a novel word is encountered it
        # is spelled out (-H), and the resulting dictionary must be less
        # than 5 meg with the most frequent words being put into the
        # dictionary first (-2 -k 5120)
        # note: these options are left over from the mg version
        print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_compression_dict_exe") {
            die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n";
        }
        system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120");

        # Second pass.  Reopening the same named handle keeps the
        # document processor's output handle valid.
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -f $basefilename -d $builddir -T2")) {
            die "mgppbuilder::compress_text - couldn't run $mg_passes_exe\n";
        }
    }

    $self->{'buildproc'}->reset();

    # compress the text
    print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1);
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};
}
388
# Decides whether an index should be built.
# Each entry of the 'dontbuild' configuration list is used as a pattern
# that has to match the whole index description.  On the first match the
# directory name mapped to the index is recorded in 'notbuilt' and 0 is
# returned; if nothing matches (or no list is configured) 1 is returned.
sub want_built {
    my ($self, $index) = @_;

    my $patterns = $self->{'collect_cfg'}->{'dontbuild'};
    return 1 unless defined ($patterns);

    foreach my $pattern (@$patterns) {
        next unless $index =~ /^$pattern$/;

        # remember the directory of the index that is being skipped
        push (@{$self->{'notbuilt'}}, $self->{'index_mapping'}->{$index});
        return 0;
    }

    return 1;
}
404
# Builds the indexes of the collection.
# If $indexname contains a word character only that index is built,
# otherwise every index listed in the collection configuration is.  The
# mapping from index descriptions to directory names is (re)created and
# stored in 'index_mapping'; indexes rejected by want_built are skipped.
sub build_indexes {
    my ($self, $indexname) = @_;

    my $indexes = (defined $indexname && $indexname =~ /\w/)
        ? [$indexname]
        : $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    foreach my $index (@$indexes) {
        if (!$self->want_built($index)) {
            print STDERR "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);
            next;
        }

        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
433
# Creates directory names for each of the index descriptions.
#
# Argument: a reference to a list of index descriptions of the form
# "fields[:subcollection[:languages]]".
#
# Returns a reference to a hash holding
#   - one entry per index description, giving its directory name,
#   - 'indexmap', 'subcollectionmap' and 'languagemap', which map each
#     part of a description to its processed (shortened) form, and
#   - 'indexmaporder', 'subcollectionmaporder' and 'languagemaporder',
#     which list the keys of those maps in order of first appearance.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();
    $mapping{'indexmaporder'} = [];
    $mapping{'subcollectionmaporder'} = [];
    $mapping{'languagemaporder'} = [];

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # processed name => original description, kept separately for each
    # part.  These must be real hashes: the empty strings used before
    # were dereferenced symbolically, so all three parts shared one
    # global table.
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    my %seen = ();

    foreach my $index (@$indexes) {
        # An exact repeat already has its directory; asking make_unique
        # to separate it from itself can never succeed.
        next if $seen{$index}++;

        my ($fields, $subcollection, $languages) = split (":", $index);

        # the directory name starts with a processed version of index fields
        my $pindex = lc ($self->process_field ($fields));

        # next comes a processed version of the subcollection if there is one.
        my $psub = lc ($self->process_field ($subcollection));

        # next comes a processed version of the language if there is one.
        my $plang = lc ($self->process_field ($languages));

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            $dirname = $self->make_unique (\%pnames, $index, \$pindex, \$psub, \$plang);
        }

        # store the mapping orders as well as the maps
        if (!defined $mapping{'indexmap'}{"$fields"}) {
            $mapping{'indexmap'}{"$fields"} = $pindex;
            push (@{$mapping{'indexmaporder'}}, "$fields");
        }
        if ($psub =~ /\w/ && !defined ($mapping{'subcollectionmap'}{$subcollection})) {
            $mapping{'subcollectionmap'}{$subcollection} = $psub;
            push (@{$mapping{'subcollectionmaporder'}}, $subcollection);
        }
        if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) {
            $mapping{'languagemap'}{$languages} = $plang;
            # record the language of this description (this used to push
            # an unrelated, undefined variable)
            push (@{$mapping{'languagemaporder'}}, $languages);
        }
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$fields";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
495
# Returns a shortened version of a field description.
# An undefined field, or one without any word character, gives "".
# A field with two or more comma separated components gives the first
# character of each of its first two components.  A field with a single
# component gives its first character followed by the next consonant, or
# simply its first two characters when no consonant follows.
sub process_field {
    my ($self, $field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @parts = split /,/, $field;
    if (scalar @parts >= 2) {
        # initials of the first two components
        my @initials = ();
        foreach my $part (@parts[0, 1]) {
            (my $initial = $part) =~ s/^(.).*$/$1/;
            push (@initials, $initial);
        }
        return join("", @initials);
    }

    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;
    ($first, $second) = $field =~ /^(.)(.)/ unless defined $first && defined $second;
    return "$first$second";
}
518
# Alters one part of a colliding directory name and returns the new name.
# $namehash maps, for each of 'index', 'subcollection' and 'languages',
# a processed name to the description it was created for.  The first part
# (index, then subcollection, then language) whose processed name is
# recorded for a different description than the one in $index is moved
# on to its next version through get_next_version.  The referenced name
# parts are modified in place and their concatenation is returned.
sub make_unique {
    my ($self, $namehash, $index, $indexref, $subref, $langref) = @_;
    my ($fields, $subcollection, $languages) = split (":", $index);

    # table key, reference to the processed name, description it belongs to
    my @parts = (['index',         $indexref, "$fields"],
                 ['subcollection', $subref,   $subcollection],
                 ['languages',     $langref,  $languages]);

    foreach my $part (@parts) {
        my ($kind, $nameref, $wanted) = @$part;
        next unless $namehash->{$kind}->{$$nameref} ne $wanted;

        # only the first differing part is changed per call
        $self->get_next_version ($nameref);
        last;
    }

    return "$$indexref$$subref$$langref";
}
533
# Moves the name referenced by $nameref on to its next version, in place.
# A name ending in one or two digits has that number incremented
# ("t0" -> "t1", "t9" -> "t10", "t10" -> "t11"); any other name has its
# last character replaced by "0" ("ti" -> "t0").
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d\d?)$/) {
        # A single trailing 9 has to roll over to 10 as well.  It used to
        # be left unchanged, so callers looping until the name became
        # unique never finished.
        my $num = $1; $num ++;
        $$nameref =~ s/\d\d?$/$num/;
    } else {
        $$nameref =~ s/.$/0/;
    }
}
549
# Builds a single index.
#
# Argument: $index, an index description; its directory name is looked
# up in 'index_mapping'.
#
# The documents are passed through mg_passes twice (-I1, then -I2 after
# the perfect hash function has been built), followed by the weights
# file, the stemmed dictionary and the three stem indexes.  In debug mode
# the documents are written to STDOUT and none of the tools are run.
#
# Dies if a required executable does not exist or if a pipe to mg_passes
# cannot be opened.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $builddir = $self->{'build_dir'};

    my $basefilename = &util::filename_cat ($indexdir,
                                            $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text";

    my $exe = &util::get_os_exe ();
    my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe");
    my $mg_perf_hash_build_exe =
        &util::filename_cat($exedir, "mg_perf_hash_build$exe");
    my $mg_weights_build_exe =
        &util::filename_cat ($exedir, "mg_weights_build$exe");
    my $mg_invf_dict_exe =
        &util::filename_cat ($exedir, "mg_invf_dict$exe");
    my $mg_stem_idx_exe =
        &util::filename_cat ($exedir, "mg_stem_idx$exe");

    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $builddir=~ s/\//\\/g;
        $basefilename =~ s/\//\\/g;
    }

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my (undef, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subname (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subname})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subname});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    foreach my $language (@{$self->{'collect_cfg'}->{'languages'}}) {
        # Work on a copy: the loop variable aliases the configuration
        # entry, so stripping the "!" in place would silently turn a
        # negated language into a plain one for every later index.
        my $langname = $language;
        if ($langname =~ s/^\!//) {
            push (@$indexexparr, "!Language/$langname/");
        } else {
            push (@$indexexparr, "Language/$langname/");
        }
    }

    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);

    # The document processor is handed the *name* of the output handle
    # (a symbolic file handle), exactly as the bareword form did.
    my $handle;
    if ($self->{'debug'}) {
        $handle = 'STDOUT';
    } else {
        if (!-e "$mg_passes_exe" ||
            !open (PIPEOUT, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -I1")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ($handle);
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);
    $self->{'buildproc'}->set_indexing_text (1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
    close ($handle) unless $self->{'debug'};

    if (!$self->{'debug'}) {
        # create the perfect hash function
        if (!-e "$mg_perf_hash_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n";
        }
        system ("$mg_perf_hash_build_exe -d $builddir -f $basefilename");

        # Second pass.  Reopening the same named handle keeps the
        # document processor's output handle valid.
        if (!-e "$mg_passes_exe" ||
            !open ($handle, "| $mg_passes_exe -K Section -K Paragraph -d $builddir -f $basefilename -I2")) {
            die "mgppbuilder::build_index - couldn't run $mg_passes_exe\n";
        }
    }

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);

    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'}, $self->{'maxdocs'});

    if (!$self->{'debug'}) {

        close ($handle);

        # create the weights file
        print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_weights_build_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n";
        }
        system ("$mg_weights_build_exe -d $builddir -f $basefilename");

        # create 'on-disk' stemmed dictionary
        print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_invf_dict_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n";
        }
        system ("$mg_invf_dict_exe -d $builddir -f $basefilename");

        # creates stem index files for the various stemming methods
        print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
        if (!-e "$mg_stem_idx_exe") {
            die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n";
        }
        system ("$mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");
        system ("$mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");

        # remove unwanted files (disabled)
#       my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
#       opendir (DIR, $tmpdir) || die
#           "mgppbuilder::build_index - couldn't read directory $tmpdir\n";
#       foreach $file (readdir(DIR)) {
#           next if $file =~ /^\./;
#           my ($suffix) = $file =~ /\.([^\.]+)$/;
#           if (defined $suffix && !defined $wanted_index_files{$suffix}) {
#               # delete it!
#               print STDERR "deleting $file\n" if $self->{'verbosity'} > 2;
#               &util::rm (&util::filename_cat ($tmpdir, $file));
#           }
#       }
#       closedir (DIR);
    }
}
693
# Creates the information database of the collection and processes the
# associated files.
# The output of the document processor (mode 'infodb') is piped into
# txt2db, which is given the database file text/<collection>.ldb or
# text/<collection>.bdb below the build directory; in debug mode it is
# written to STDOUT instead.  The collection metadata from the
# configuration comes first, then the documents, then the classification
# information.  Dies if txt2db does not exist or the pipe cannot be
# opened.
sub make_infodatabase {
    my $self = shift (@_);

    # make sure the text and assoc directories exist
    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc");
    &util::mk_all_dir ($textdir);
    &util::mk_all_dir ($assocdir);

    # get db name; the extension depends on the byte order
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe");

    if ($self->{'verbosity'} >= 1) {
        print STDERR "\n*** creating the info database and processing associated files\n";
    }

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'});

    # set up the document processor; it is handed the name of the output
    # handle (a symbolic file handle)
    my $handle = 'STDOUT';
    if (!$self->{'debug'}) {
        if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) {
            die "mgppbuilder::make_infodatabase - couldn't run $txt2db_exe\n";
        }
        $handle = 'mgppbuilder::PIPEOUT';
    }

    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ($handle);
    $buildproc->set_mode ('infodb');
    $buildproc->set_assocdir ($assocdir);
    $buildproc->set_dontgdbm ($self->{'dontgdbm'});
    $buildproc->set_classifiers ($self->{'classifiers'});
    $buildproc->set_indexing_text (0);
    $buildproc->reset();

    my $collectionmeta = $self->{'collect_cfg'}->{'collectionmeta'};
    if (defined $collectionmeta) {

        # index specific metadata is stored under the directory name of
        # the index, so the mapping is needed here
        if (!defined $self->{'index_mapping'}) {
            $self->{'index_mapping'} =
                $self->create_index_mapping ($self->{'collect_cfg'}->{'indexes'});
        }

        print $handle "[collection]\n";

        foreach my $key (keys (%$collectionmeta)) {
            my $value = $collectionmeta->{$key};

            # ordinary metadata is written under its own name
            if ($key !~ /^\./) {
                print $handle "<$key>$value\n";
                next;
            }

            # a leading "." marks metadata belonging to an index
            (my $indexdesc = $key) =~ s/^\.//;
            my $dirname = $self->{'index_mapping'}->{$indexdesc};
            if (defined $dirname) {
                print $handle "<$dirname>" . $value . "\n";
                print STDERR "have .section entry in collect file\n";
            } else {
                print STDERR "mgppbuilder: warning bad collectionmeta option '$indexdesc' - ignored\n";
            }
        }
        print $handle "\n" . ('-' x 70) . "\n";
    }

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $buildproc, $self->{'maxdocs'});

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, $handle,
                                     $self->{'allclassifications'});

    close ($handle) if !$self->{'debug'};
}
771
# Hook for collection specific processing.  This implementation does
# nothing apart from picking up the object it was called on.
sub collect_specific {
    my $self = $_[0];
}
775
# Writes build.cfg into the build directory.
# The file records the build date, the number of documents and bytes
# reported by the document processor, the index, subcollection and
# language maps ("description->directory part", in the order stored in
# 'index_mapping') and the list of indexes that were not built.  The
# subcollection and language maps are only written when they are not
# empty.
sub make_auxiliary_files {
    my $self = shift (@_);

    # A fresh hash for every call.  The configuration used to be collected
    # in a package global, so entries from an earlier call could leak into
    # the file written by a later one.
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names
    my @indexmap = ();
    foreach my $index (@{$self->{'index_mapping'}->{'indexmaporder'}}) {
        push (@indexmap, $index . "->" .
              $self->{'index_mapping'}->{'indexmap'}->{$index});
    }
    $build_cfg->{'indexmap'} = \@indexmap;

    my @subcollectionmap = ();
    foreach my $subcollection (@{$self->{'index_mapping'}->{'subcollectionmaporder'}}) {
        push (@subcollectionmap, $subcollection . "->" .
              $self->{'index_mapping'}->{'subcollectionmap'}->{$subcollection});
    }
    $build_cfg->{'subcollectionmap'} = \@subcollectionmap if scalar (@subcollectionmap);

    my @languagemap = ();
    foreach my $language (@{$self->{'index_mapping'}->{'languagemaporder'}}) {
        push (@languagemap, $language . "->" .
              $self->{'index_mapping'}->{'languagemap'}->{$language});
    }
    $build_cfg->{'languagemap'} = \@languagemap if scalar (@languagemap);

    $build_cfg->{'notbuilt'} = $self->{'notbuilt'};

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap|notbuilt)$');
}
822
# Counterpart of init.  This implementation has nothing to clean up and
# only picks up the object it was called on.
sub deinit {
    my $self = $_[0];
}
826
827
8281;
829
830
Note: See TracBrowser for help on using the repository browser.