source: trunk/gsdl/perllib/mgbuilder.pm@ 246

Last change on this file since 246 was 246, checked in by sjboddie, 25 years ago

Now checks collect.cfg for a 'doctype' field; if one is found, it is passed
to the plugins as metadata and is stored in the gdbm as 'classifytype' at the
top level of each document.

  • Property svn:keywords set to Author Date Id Revision
File size: 18.6 KB
Line 
1# MGBuilder object
2#
3
4package mgbuilder;
5
6use classify;
7use cfgread;
8use colcfg;
9use plugin;
10use util;
11
# value passed to mg_passes with -b (the maximum document size, 12 meg)
$maxdocsize = 12000;

# suffixes of the files that are kept in an index directory once the
# index has been built; files with any other suffix are deleted
%wanted_index_files = map { $_ => 1 }
                      qw(td t idb ib1 ib2 ib3 i ip tiw wa);
24
25
# Creates an mgbuilder object for a collection.
#
# Arguments: the class name, the collection name, the directory holding
# the source documents, the directory to build into, the verbosity level
# and a flag saying whether the new style gdbm database is wanted.
#
# Reads $GSDLCOLLECTDIR/etc/collect.cfg, expands the list of indexes by
# any subcollections and languages, loads the plugins, (for the new gdbm
# only) loads the classifiers and the 'doctype' setting, and creates the
# document processor stored in $self->{'buildproc'}.
#
# Dies if collect.cfg is missing, if no plugins could be loaded or if the
# document processor cannot be loaded or created.
sub new {
    my ($class, $collection, $source_dir, $build_dir, $verbosity, $newgdbm) = @_;

    # create an mgbuilder object
    my $self = bless {'collection'=>$collection,
                      'source_dir'=>$source_dir,
                      'build_dir'=>$build_dir,
                      'verbosity'=>$verbosity,
                      'newgdbm'=>$newgdbm}, $class;

    # read in the collection configuration file
    my $colcfgname = "$ENV{'GSDLCOLLECTDIR'}/etc/collect.cfg";
    if (!-e $colcfgname) {
        die "mgbuilder::new - couldn't find collect.cfg for collection $collection\n";
    }
    $self->{'collect_cfg'} = &colcfg::read_collect_cfg ($colcfgname);

    # sort out subcollection indexes first and language subindexes second:
    # every existing index description gets one copy per subcollection
    # (or language) with ":<name>" appended
    foreach my $cfgkey ('indexsubcollections', 'languages') {
        next unless defined $self->{'collect_cfg'}->{$cfgkey};

        my $indexes = $self->{'collect_cfg'}->{'indexes'};
        my @expanded = ();
        foreach my $suffix (@{$self->{'collect_cfg'}->{$cfgkey}}) {
            foreach my $index (@$indexes) {
                push (@expanded, "$index:$suffix");
            }
        }
        $self->{'collect_cfg'}->{'indexes'} = \@expanded;
    }

    # get the list of plugins for this collection
    my @plugins = (); # some good choice of plugins .... ????
    if (defined $self->{'collect_cfg'}->{'plugins'}) {
        @plugins = @{$self->{'collect_cfg'}->{'plugins'}};
    }

    # load all the plugins
    $self->{'pluginfo'} = &plugin::load_plugins ($collection, \@plugins);
    if (scalar(@{$self->{'pluginfo'}}) == 0) {
        print STDERR "No plugins were loaded.\n";
        die "\n";
    }

    # load all the classifiers and set the classifytype to use for
    # displaying documents - if the doctype field hasn't been set in the
    # collect.cfg then the receptionist currently defaults to displaying
    # documents as 'Book'.
    # Both settings are picked up in a single pass over collect.cfg.
    if ($self->{'newgdbm'}) {
        $self->{'classifiers'} = [];
        if (open (COLCFG, $colcfgname)) {
            while (defined (my $line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) {
                next unless scalar(@$line) >= 2;

                my $key = shift (@$line);
                if ($key eq "classify") {
                    my $classinfo = &classify::load_classifier($line);
                    push (@{$self->{'classifiers'}}, $classinfo)
                        if defined $classinfo;

                } elsif ($key eq "doctype" && scalar(@$line) == 1) {
                    # only "doctype <value>" lines are accepted
                    $self->{'classifytype'} = shift (@$line);
                }
            }
            close (COLCFG);
        } else {
            print STDERR "mgbuilder::new couldn't read the cfg file $colcfgname\n";
            print STDERR " no classifiers were loaded\n";
        }
    }

    # load up the document processor for building
    # if a buildproc class has been created for this collection, use it
    # otherwise, use the mg buildproc
    my ($buildprocdir, $buildproctype);
    if (-e "$ENV{'GSDLCOLLECTDIR'}/perllib/${collection}buildproc.pm") {
        $buildprocdir = "$ENV{'GSDLCOLLECTDIR'}/perllib";
        $buildproctype = "${collection}buildproc";
    } else {
        $buildprocdir = "$ENV{'GSDLHOME'}/perllib";
        $buildproctype = "mgbuildproc";
    }
    require "$buildprocdir/$buildproctype.pm";

    # a class name held in a string can be used directly as the invocant,
    # so no string eval (or indirect object syntax) is needed here
    $self->{'buildproc'} = $buildproctype->new ($collection, $source_dir,
                                                $build_dir, $verbosity,
                                                $newgdbm);

    return $self;
}
138
# Prepares the build directory: whatever is left of an earlier build is
# removed, then the build directory and its "text" subdirectory are
# created again.
sub init {
    my ($self) = @_;
    my $builddir = $self->{'build_dir'};

    # start from an empty build directory
    &util::rm_r($builddir);
    &util::mk_all_dir($builddir);

    # the text directory lives directly below it
    my $textdir = $builddir . "/text";
    &util::mk_all_dir($textdir);
}
150
# Creates the compressed text for the collection below <build_dir>/text.
# $textindex is handed to the document processor with set_index.
#
# Three steps are run in order: an mg_passes -T1 pass that collects the
# text statistics, mg_compression_dict, and an mg_passes -T2 pass that
# compresses the text. For both passes the documents are fed to mg_passes
# through the PIPEOUT handle by the plugins.
# Dies if one of the executables is missing or mg_passes cannot be started.
sub compress_text {
    my ($self, $textindex) = @_;
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'},
                                              "text/$self->{'collection'}");

    # windows wants backslashes, everything else is told to use "/" as
    # the base directory
    my $osextra = "";
    if ($ENV{'GSDLOS'} !~ /^windows$/i) {
        $osextra = " -d /";
    } else {
        $fulltextprefix =~ s/\//\\/g;
    }

    my $verbose = ($self->{'verbosity'} >= 1);
    print STDERR "\n*** creating the compressed text\n" if $verbose;

    # set up the document processor
    my $buildproc = $self->{'buildproc'};
    $buildproc->set_output_handle ('mgbuilder::PIPEOUT');
    $buildproc->set_mode ('text');
    $buildproc->set_index ($textindex);

    # runs one mg_passes pass over all the documents
    # -b $maxdocsize sets the maximum document size to be 12 meg
    my $mg_passes = "$exedir/mg_passes$exe";
    my $run_pass = sub {
        my ($message, $passflag) = @_;

        print STDERR $message if $verbose;
        unless (-e $mg_passes && open (PIPEOUT,
                "| $mg_passes -f $fulltextprefix -b $maxdocsize $passflag $osextra")) {
            die "mgbuilder::compress_text - couldn't run $mg_passes\n";
        }
        $buildproc->reset();
        &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                       "", {}, $buildproc);
        close (PIPEOUT);
    };

    # collect the statistics for the text
    $run_pass->("\n collecting text statistics\n", "-T1");

    # create the compression dictionary
    # the compression dictionary is built by assuming the stats are from a seed
    # dictionary (-S), if a novel word is encountered it is spelled out (-H),
    # and the resulting dictionary must be less than 5 meg with the most frequent
    # words being put into the dictionary first (-2 -k 5120)
    print STDERR "\n creating the compression dictionary\n" if $verbose;
    my $mg_compression_dict = "$exedir/mg_compression_dict$exe";
    die "mgbuilder::compress_text - couldn't run $mg_compression_dict\n"
        unless -e $mg_compression_dict;
    system ("$mg_compression_dict -f $fulltextprefix -S -H -2 -k 5120 $osextra");

    # compress the text
    $run_pass->("\n compressing the text\n", "-T2");
}
210
# Builds every index listed under 'indexes' in the collection
# configuration. The mapping from index descriptions to directory names
# is created first and kept in $self->{'index_mapping'}; build_index is
# then called once for each description.
sub build_indexes {
    my $self = shift (@_);
    my $indexes = $self->{'collect_cfg'}->{'indexes'};

    # create the mapping between the index descriptions
    # and their directory names
    $self->{'index_mapping'} = $self->create_index_mapping ($indexes);

    # build each of the indexes
    # (the loop variable is lexical so it cannot clash with a package
    # global of the same name)
    foreach my $index (@$indexes) {
        print STDERR "\n*** building index $index in subdirectory " .
            "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1);
        $self->build_index($index);
    }
}
226
# creates directory names for each of the index descriptions
#
# $indexes is a reference to a list of descriptions of the form
# "level:granularity[:subcollection[:languages]]".
#
# Returns a reference to a hash which maps each description to its
# directory name and which also contains three sub-hashes:
#   'indexmap'         - "level:granularity" => processed index name
#   'subcollectionmap' - subcollection       => processed name
#   'languagemap'      - languages           => processed name
#
# A description that occurs more than once is only mapped once. Dies if
# no unique directory name can be found for a description.
sub create_index_mapping {
    my $self = shift (@_);
    my ($indexes) = @_;

    my %mapping = ();

    # dirnames is used to check for collisions. Start this off
    # with the mandatory directory names
    my %dirnames = ('text'=>'text',
                    'extra'=>'extra');

    # records which original value each processed name stands for. Every
    # entry has to be a hash of its own - initialising them to '' would
    # make all three share a single global hash through a symbolic
    # reference
    my %pnames = ('index' => {}, 'subcollection' => {}, 'languages' => {});

    # descriptions that have already been given a directory
    my %seen = ();

    foreach my $index (@$indexes) {
        # a repeated description already has its directory; trying to
        # make its name "unique" again would never terminate
        next if $seen{$index}++;

        my ($level, $gran, $subcollection, $languages) = split (":", $index);

        # the directory name starts with the first character of the index level
        my ($pindex) = $level =~ /^(.)/;

        # next comes a processed version of the index
        $pindex .= $self->process_field ($gran);
        $pindex = lc ($pindex);

        # next comes a processed version of the subcollection if there is one.
        my $psub = $self->process_field ($subcollection);
        $psub = lc ($psub);

        # next comes a processed version of the language if there is one.
        my $plang = $self->process_field ($languages);
        $plang = lc ($plang);

        my $dirname = $pindex . $psub . $plang;

        # check to be sure all index names are unique
        while (defined ($dirnames{$dirname})) {
            my $candidate = $self->make_unique (\%pnames, $index, \$pindex,
                                                \$psub, \$plang);

            # give up rather than loop forever if nothing was changed
            die "mgbuilder::create_index_mapping - couldn't create a unique " .
                "directory name for index $index\n" if $candidate eq $dirname;

            $dirname = $candidate;
        }

        $mapping{'indexmap'}{"$level:$gran"} = $pindex;
        $mapping{'subcollectionmap'}{$subcollection} = $psub if $psub =~ /\w/;
        $mapping{'languagemap'}{$languages} = $plang if $plang =~ /\w/;
        $mapping{$index} = $dirname;
        $dirnames{$dirname} = $index;
        $pnames{'index'}{$pindex} = "$level:$gran";
        $pnames{'subcollection'}{$psub} = $subcollection;
        $pnames{'languages'}{$plang} = $languages;
    }

    return \%mapping;
}
277
# returns a processed version of a field.
# if the field has only one component the processed
# version will contain the first character and next consonant
# of that component (or the first two characters if there is no
# later consonant, or just the one character if that is all there
# is) - otherwise it will contain the first character of the first
# two components.
# components are separated by commas. an undefined field, or one
# without any word character, gives the empty string.
sub process_field {
    my $self = shift (@_);
    my ($field) = @_;

    return "" unless (defined ($field) && $field =~ /\w/);

    my @components = split /,/, $field;
    if (scalar @components >= 2) {
        # first character of each of the first two components
        return join ("", map { substr ($_, 0, 1) } @components[0, 1]);
    }

    # first character followed by the next consonant ...
    my ($first, $second) = $field =~ /^(.).*?([bcdfghjklmnpqrstvwxyz])/i;

    # ... or failing that the first two characters ...
    ($first, $second) = $field =~ /^(.)(.)/
        unless defined $first && defined $second;

    # ... or, for a field of a single character, just that character
    ($first) = $field =~ /^(.)/ unless defined $first;

    $first = "" unless defined $first;
    $second = "" unless defined $second;
    return "$first$second";
}
300
# Tries to resolve a directory name collision for the index description
# $index ("level:granularity[:subcollection[:languages]]").
#
# $namehash holds, under 'index', 'subcollection' and 'languages', which
# original value each processed name already stands for. $indexref,
# $subref and $langref are references to the three processed parts of
# the directory name.
#
# The first part whose processed name is not already registered for this
# index's own value is moved on to its next version (at most one part
# per call). Returns the three parts joined together; if no part needed
# changing the name comes back unchanged.
sub make_unique {
    my $self = shift (@_);
    my ($namehash, $index, $indexref, $subref, $langref) = @_;
    my ($level, $gran, $subcollection, $languages) = split (":", $index);

    # parts missing from the description, and names that have not been
    # registered yet, count as empty strings - the comparisons give the
    # same answers as with undef but don't warn under -w
    my @wanted = ("$level:" . (defined $gran ? $gran : ""),
                  defined $subcollection ? $subcollection : "",
                  defined $languages ? $languages : "");
    my @owner = ($namehash->{'index'}->{$$indexref},
                 $namehash->{'subcollection'}->{$$subref},
                 $namehash->{'languages'}->{$$langref});
    foreach my $value (@owner) {
        $value = "" unless defined $value;
    }

    if ($owner[0] ne $wanted[0]) {
        $self->get_next_version ($indexref);
    } elsif ($owner[1] ne $wanted[1]) {
        $self->get_next_version ($subref);
    } elsif ($owner[2] ne $wanted[2]) {
        $self->get_next_version ($langref);
    }
    return "$$indexref$$subref$$langref";
}
315
# Moves the name referred to by $nameref on to its next version, in place.
#
# A name that ends in digits has that number increased by one
# ("d0" -> "d1", "d9" -> "d10", "d99" -> "d100"); any other name has its
# last character replaced by "0" ("dt" -> "d0"). An empty name becomes
# "0". The name is always changed, so a caller that keeps asking for the
# next version until a name is free cannot get stuck.
sub get_next_version {
    my $self = shift (@_);
    my ($nameref) = @_;

    if ($$nameref =~ /(\d+)$/) {
        # $num has only been used as a string, so ++ keeps any leading
        # zeros ("09" becomes "10", "00" becomes "01")
        my $num = $1;
        $num ++;
        $$nameref =~ s/\d+$/$num/;

    } elsif ($$nameref !~ s/.$/0/) {
        # there was no last character to replace
        $$nameref .= "0";
    }
}
331
# Builds the index described by $index
# ("level:fields[:subcollection...]") in its own directory below the
# build directory. The directory name is looked up in
# $self->{'index_mapping'}, so create_index_mapping must have been run
# (build_indexes does this).
#
# The steps are: index dictionary (mg_passes -N1), perfect hash function,
# inversion (mg_passes -N2), weights file, 'on-disk' stemmed dictionary
# and the stem indexes. Afterwards every file in the index directory
# whose suffix is not in %wanted_index_files is deleted.
# Dies if an executable is missing, if mg_passes cannot be started or if
# the index directory cannot be read.
sub build_index {
    my $self = shift (@_);
    my ($index) = @_;

    # get the full index directory path and make sure it exists
    my $indexdir = $self->{'index_mapping'}->{$index};
    &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, $indexdir));
    my $fullindexprefix = &util::filename_cat ($self->{'build_dir'}, $indexdir,
                                               $self->{'collection'});
    my $fulltextprefix = &util::filename_cat ($self->{'build_dir'}, "text",
                                              $self->{'collection'});

    # get any os specific stuff
    my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}";
    my $exe = &util::get_os_exe ();
    my $osextra = "";
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        # both prefixes end up on a command line, so both are converted
        # (compress_text converts the text prefix in the same way)
        $fullindexprefix =~ s/\//\\/g;
        $fulltextprefix =~ s/\//\\/g;
    } else {
        $osextra = " -d /";
    }

    # get the index level from the index description
    # the index will be level 2 unless we are building a
    # paragraph level index
    my $index_level = 2;
    $index_level = 3 if $index =~ /^paragraph/i;

    # get the index expression if this index belongs
    # to a subcollection
    my $indexexparr = [];
    my (undef, undef, $subcollection) = split (":", $index);
    my @subcollections = ();
    @subcollections = split /,/, $subcollection if (defined $subcollection);

    foreach my $subcol (@subcollections) {
        if (defined ($self->{'collect_cfg'}->{'subcollection'}->{$subcol})) {
            push (@$indexexparr, $self->{'collect_cfg'}->{'subcollection'}->{$subcol});
        }
    }

    # add expressions for languages if this index belongs to
    # a language subcollection
    my $languages = $self->{'collect_cfg'}->{'languages'};
    $languages = [] unless defined $languages;
    foreach my $cfglanguage (@$languages) {
        # work on a copy: the loop variable is an alias for the entry in
        # the configuration, and stripping the "!" from that would lose
        # the negation for every index built after this one
        my $language = $cfglanguage;
        if ($language =~ s/^\!//) {
            push (@$indexexparr, "!Language/$language/");
        } else {
            push (@$indexexparr, "Language/$language/");
        }
    }

    # set up the document processor
    $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT');
    $self->{'buildproc'}->set_mode ('text');
    $self->{'buildproc'}->set_index ($index, $indexexparr);


    # Build index dictionary. Uses verbatim stem method
    print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_passes$exe" || !open (PIPEOUT,
            "| $exedir/mg_passes$exe -f $fullindexprefix -b $maxdocsize " .
            "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {
        die "mgbuilder::build_index - couldn't run $exedir/mg_passes$exe\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});
    close (PIPEOUT);

    # create the perfect hash function
    if (!-e "$exedir/mg_perf_hash_build$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_perf_hash_build$exe\n";
    }
    system ("$exedir/mg_perf_hash_build$exe -f $fullindexprefix $osextra");

    # invert the text
    print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_passes$exe" || !open (PIPEOUT,
            "| $exedir/mg_passes$exe -f $fullindexprefix -b $maxdocsize " .
            "-$index_level -c 3 -G -t 10 -N2 $osextra")) {
        die "mgbuilder::build_index - couldn't run $exedir/mg_passes$exe\n";
    }
    $self->{'buildproc'}->reset();
    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", {}, $self->{'buildproc'});
    close (PIPEOUT);

    # create the weights file
    print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_weights_build$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_weights_build$exe\n";
    }
    system ("$exedir/mg_weights_build$exe -f $fullindexprefix -t $fulltextprefix $osextra");

    # create 'on-disk' stemmed dictionary
    print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_invf_dict$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_invf_dict$exe\n";
    }
    system ("$exedir/mg_invf_dict$exe -f $fullindexprefix $osextra");


    # creates stem index files for the various stemming methods
    print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);
    if (!-e "$exedir/mg_stem_idx$exe") {
        die "mgbuilder::build_index - couldn't run $exedir/mg_stem_idx$exe\n";
    }
    system ("$exedir/mg_stem_idx$exe -b 4096 -s1 -f $fullindexprefix $osextra");
    system ("$exedir/mg_stem_idx$exe -b 4096 -s2 -f $fullindexprefix $osextra");
    system ("$exedir/mg_stem_idx$exe -b 4096 -s3 -f $fullindexprefix $osextra");


    # remove unwanted files
    my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir);
    opendir (DIR, $tmpdir) || die
        "mgbuilder::build_index - couldn't read directory $tmpdir\n";
    foreach my $file (readdir(DIR)) {
        next if $file =~ /^\./;
        my ($suffix) = $file =~ /\.([^\.]+)$/;
        if (defined $suffix && !defined $wanted_index_files{$suffix}) {
            # delete it!
#           print STDERR "deleting $file\n";
            &util::rm (&util::filename_cat ($tmpdir, $file));
        }
    }
    closedir (DIR);
}
459
# Creates the info database <build_dir>/text/<collection>.ldb (or .bdb
# when util::is_little_endian is false) by piping the output of the
# document processor into txt2db.
#
# For the new style gdbm the classifiers are initialised first, the
# document processor runs in 'newinfodb' mode and the classification
# information is written out after the documents; otherwise the
# processor runs in 'infodb' mode. A 'classifytype' read from
# collect.cfg is passed to the plugins as metadata.
# Dies if txt2db is missing or cannot be started.
sub make_infodatabase {
    my ($self) = @_;
    my $buildproc = $self->{'buildproc'};

    my $textdir = &util::filename_cat($self->{'build_dir'}, "text");
    &util::mk_all_dir ($textdir);

    # get db name
    my $dbext = &util::is_little_endian() ? ".ldb" : ".bdb";
    my $fulldbname = &util::filename_cat ($textdir, "$self->{'collection'}$dbext");
    if ($ENV{'GSDLOS'} =~ /^windows$/i) {
        $fulldbname =~ s/\//\\/g;
    }

    my $exe = &util::get_os_exe ();
    my $txt2db = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}/txt2db$exe";

    print STDERR "\n*** creating the info database\n" if ($self->{'verbosity'} >= 1);

    # init all the classifiers
    &classify::init_classifiers ($self->{'classifiers'}) if $self->{'newgdbm'};

    # set up the document processor
    $buildproc->set_output_handle ('mgbuilder::PIPEOUT');
    if (!$self->{'newgdbm'}) {
        $buildproc->set_mode ('infodb');
    } else {
        $buildproc->set_mode ('newinfodb');
        $buildproc->set_classifiers ($self->{'classifiers'});
    }

    # create the infodatabase
    unless (-e $txt2db && open (PIPEOUT, "| $txt2db $fulldbname")) {
        die "mgbuilder::make_infodatabase - couldn't run $txt2db\n";
    }
    $buildproc->reset();

    # the classifytype (if there is one) goes to the plugins as metadata
    my $metadata = {};
    my $classifytype = $self->{'classifytype'};
    $metadata->{'classifytype'} = $classifytype
        if (defined $classifytype && $classifytype =~ /\w/);

    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
                   "", $metadata, $buildproc);

    # output classification information
    &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT')
        if $self->{'newgdbm'};

    close (PIPEOUT);
}
511
# Writes <build_dir>/build.cfg.
#
# The file records the build date, the number of documents and bytes
# reported by the document processor, and the mappings created by
# create_index_mapping as lists of "name->directory part" entries:
# 'indexmap' is always written, 'subcollectionmap' and 'languagemap'
# only when they have entries. Entries are sorted by name so that the
# same build always produces the same file.
sub make_auxiliary_files {
    my $self = shift (@_);

    # a fresh lexical hash for every call, so nothing written by an
    # earlier call can find its way into this build.cfg
    my $build_cfg = {};

    print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1);

    # make sure the build directory exists
    &util::mk_all_dir ($self->{'build_dir'});

    # store the build date
    $build_cfg->{'builddate'} = time;

    # store the number of documents and number of bytes
    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
    $build_cfg->{'numbytes'} = $self->{'buildproc'}->get_num_bytes();

    # store the mapping between the index names and the directory names,
    # and the same for the subcollections and the languages
    foreach my $mapname ('indexmap', 'subcollectionmap', 'languagemap') {
        my $map = $self->{'index_mapping'}->{$mapname};
        $map = {} unless defined $map;

        my @entries = map { $_ . "->" . $map->{$_} } sort keys %$map;

        # the index map is written even when it is empty
        $build_cfg->{$mapname} = \@entries
            if ($mapname eq 'indexmap' || scalar (@entries));
    }

    # write out the build information
    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
                             '^(builddate|numdocs|numbytes)$',
                             '^(indexmap|subcollectionmap|languagemap)$');
}
553
# Counterpart of init. There is nothing to tidy up at the moment, so
# this only takes the object off the argument list.
sub deinit {
    my $self = shift;
}
557
558
5591;
560
561
Note: See TracBrowser for help on using the repository browser.