Changeset 782
- Timestamp: 1999-11-15T17:50:16+13:00 (24 years ago)
- Location: trunk/gsdl/perllib
- Files: 1 added, 1 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/mgbuilder.pm
r780 r782 48 48 sub new { 49 49 my ($class, $collection, $source_dir, $build_dir, 50 $verbosity, $maxdocs, $ allclassifications) = @_;50 $verbosity, $maxdocs, $debug, $allclassifications) = @_; 51 51 52 52 # create an mgbuilder object … … 56 56 'verbosity'=>$verbosity, 57 57 'maxdocs'=>$maxdocs, 58 'debug'=>$debug, 58 59 'allclassifications'=>$allclassifications, 59 60 'notbuilt'=>[] # indexes not built … … 155 156 my $self = shift (@_); 156 157 157 # remove any old builds 158 &util::rm_r($self->{'build_dir'}); 159 &util::mk_all_dir($self->{'build_dir'}); 158 if (!$self->{'debug'}) { 159 # remove any old builds 160 &util::rm_r($self->{'build_dir'}); 161 &util::mk_all_dir($self->{'build_dir'}); 160 162 161 # make the text directory 162 my $textdir = "$self->{'build_dir'}/text"; 163 &util::mk_all_dir($textdir); 163 # make the text directory 164 my $textdir = "$self->{'build_dir'}/text"; 165 &util::mk_all_dir($textdir); 166 } 164 167 } 165 168 … … 185 188 print STDERR "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1); 186 189 187 # set up the document processor 188 $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT'); 190 # collect the statistics for the text 191 # -b $maxdocsize sets the maximum document size to be 12 meg 192 print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1); 193 194 my ($handle); 195 if ($self->{'debug'}) { 196 $handle = STDOUT; 197 } else { 198 if (!-e "$mg_passes_exe" || 199 !open (PIPEOUT, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) { 200 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n"; 201 } 202 $handle = mgbuilder::PIPEOUT; 203 } 204 205 $self->{'buildproc'}->set_output_handle ($handle); 189 206 $self->{'buildproc'}->set_mode ('text'); 190 207 $self->{'buildproc'}->set_index ($textindex); 191 208 $self->{'buildproc'}->set_indexing_text (0); 192 193 # collect the statistics for the text194 # -b $maxdocsize sets the maximum document size to be 12 
meg195 print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);196 if (!-e "$mg_passes_exe" || !open (PIPEOUT,197 "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T1 $osextra")) {198 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";199 }200 209 $self->{'buildproc'}->reset(); 201 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 202 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 203 close (PIPEOUT); 210 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 211 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 212 213 close ($handle) unless $self->{'debug'}; 204 214 205 215 # create the compression dictionary … … 208 218 # and the resulting dictionary must be less than 5 meg with the most frequent 209 219 # words being put into the dictionary first (-2 -k 5120) 210 print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); 211 if (!-e "$mg_compression_dict_exe") { 212 die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"; 213 } 214 system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra"); 215 220 if (!$self->{'debug'}) { 221 print STDERR "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); 222 if (!-e "$mg_compression_dict_exe") { 223 die "mgbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"; 224 } 225 system ("$mg_compression_dict_exe -f $fulltextprefix -S -H -2 -k 5120 $osextra"); 226 227 # -b $maxdocsize sets the maximum document size to be 12 meg 228 if (!$self->{'debug'}) { 229 if (!-e "$mg_passes_exe" || 230 !open ($handle, "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) { 231 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n"; 232 } 233 } 234 } 235 236 $self->{'buildproc'}->reset(); 216 237 # compress the text 217 # -b $maxdocsize sets the maximum document size to be 12 meg218 238 print STDERR "\n compressing the text\n" if ($self->{'verbosity'} >= 1); 219 if (!-e 
"$mg_passes_exe" || !open (PIPEOUT,220 "| $mg_passes_exe -f $fulltextprefix -b $maxdocsize -T2 $osextra")) {221 die "mgbuilder::compress_text - couldn't run $mg_passes_exe\n";222 }223 $self->{'buildproc'}->reset();224 239 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 225 240 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 226 close ( PIPEOUT);241 close ($handle) unless $self->{'debug'}; 227 242 } 228 243 … … 245 260 sub build_indexes { 246 261 my $self = shift (@_); 247 my $indexes = $self->{'collect_cfg'}->{'indexes'}; 262 my ($indexname) = @_; 263 264 my $indexes = []; 265 if (defined $indexname && $indexname =~ /\w/) { 266 push @$indexes, $indexname; 267 } else { 268 $indexes = $self->{'collect_cfg'}->{'indexes'}; 269 } 248 270 249 271 # create the mapping between the index descriptions … … 441 463 } 442 464 } 443 465 466 # Build index dictionary. Uses verbatim stem method 467 print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1); 468 my ($handle); 469 if ($self->{'debug'}) { 470 $handle = STDOUT; 471 } else { 472 if (!-e "$mg_passes_exe" || 473 !open (PIPEOUT, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " . 474 "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) { 475 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n"; 476 } 477 $handle = mgbuilder::PIPEOUT; 478 } 479 444 480 # set up the document processor 445 $self->{'buildproc'}->set_output_handle ( 'mgbuilder::PIPEOUT');481 $self->{'buildproc'}->set_output_handle ($handle); 446 482 $self->{'buildproc'}->set_mode ('text'); 447 483 $self->{'buildproc'}->set_index ($index, $indexexparr); 448 484 $self->{'buildproc'}->set_indexing_text (1); 449 485 450 451 # Build index dictionary. 
Uses verbatim stem method452 print STDERR "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);453 if (!-e "$mg_passes_exe" || !open (PIPEOUT,454 "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " .455 "-$index_level -m 32 -s 0 -G -t 10 -N1 $osextra")) {456 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n";457 }458 486 $self->{'buildproc'}->reset(); 459 487 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 460 488 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 461 close (PIPEOUT); 462 463 # create the perfect hash function 464 if (!-e "$mg_perf_hash_build_exe") { 465 die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n"; 466 } 467 system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra"); 468 489 close ($handle) unless $self->{'debug'}; 490 491 if (!$self->{'debug'}) { 492 # create the perfect hash function 493 if (!-e "$mg_perf_hash_build_exe") { 494 die "mgbuilder::build_index - couldn't run $mg_perf_hash_build_exe\n"; 495 } 496 system ("$mg_perf_hash_build_exe -f $fullindexprefix $osextra"); 497 498 if (!-e "$mg_passes_exe" || 499 !open ($handle, "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " . 500 "-$index_level -c 3 -G -t 10 -N2 $osextra")) { 501 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n"; 502 } 503 } 504 469 505 # invert the text 470 506 print STDERR "\n inverting the text\n" if ($self->{'verbosity'} >= 1); 471 if (!-e "$mg_passes_exe" || !open (PIPEOUT, 472 "| $mg_passes_exe -f $fullindexprefix -b $maxdocsize " . 
473 "-$index_level -c 3 -G -t 10 -N2 $osextra")) { 474 die "mgbuilder::build_index - couldn't run $mg_passes_exe\n"; 475 } 507 476 508 $self->{'buildproc'}->reset(); 477 509 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 478 510 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 479 close (PIPEOUT);480 481 # create the weights file482 print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1);483 if (!-e "$mg_weights_build_exe") {484 die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n";485 }486 system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra");487 488 # create 'on-disk' stemmed dictionary489 print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);490 if (!-e "$mg_invf_dict_exe") {491 die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n";492 }493 system ("$mg_invf_dict_exe -f $fullindexprefix $osextra");494 495 496 # creates stem index files for the various stemming methods497 print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);498 if (!-e "$mg_stem_idx_exe") {499 die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n";500 }501 system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra");502 system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra");503 system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra");504 505 511 506 # remove unwanted files 507 my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir); 508 opendir (DIR, $tmpdir) || die 509 "mgbuilder::build_index - couldn't read directory $tmpdir\n"; 510 foreach $file (readdir(DIR)) { 511 next if $file =~ /^\./; 512 my ($suffix) = $file =~ /\.([^\.]+)$/; 513 if (defined $suffix && !defined $wanted_index_files{$suffix}) { 514 # delete it! 
515 # print STDERR "deleting $file\n"; 516 &util::rm (&util::filename_cat ($tmpdir, $file)); 517 } 518 } 519 closedir (DIR); 512 if (!$self->{'debug'}) { 513 514 close ($handle); 515 516 # create the weights file 517 print STDERR "\n create the weights file\n" if ($self->{'verbosity'} >= 1); 518 if (!-e "$mg_weights_build_exe") { 519 die "mgbuilder::build_index - couldn't run $mg_weights_build_exe\n"; 520 } 521 system ("$mg_weights_build_exe -f $fullindexprefix -t $fulltextprefix $osextra"); 522 523 # create 'on-disk' stemmed dictionary 524 print STDERR "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1); 525 if (!-e "$mg_invf_dict_exe") { 526 die "mgbuilder::build_index - couldn't run $mg_invf_dict_exe\n"; 527 } 528 system ("$mg_invf_dict_exe -f $fullindexprefix $osextra"); 529 530 531 # creates stem index files for the various stemming methods 532 print STDERR "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1); 533 if (!-e "$mg_stem_idx_exe") { 534 die "mgbuilder::build_index - couldn't run $mg_stem_idx_exe\n"; 535 } 536 system ("$mg_stem_idx_exe -b 4096 -s1 -f $fullindexprefix $osextra"); 537 system ("$mg_stem_idx_exe -b 4096 -s2 -f $fullindexprefix $osextra"); 538 system ("$mg_stem_idx_exe -b 4096 -s3 -f $fullindexprefix $osextra"); 539 540 541 # remove unwanted files 542 my $tmpdir = &util::filename_cat ($self->{'build_dir'}, $indexdir); 543 opendir (DIR, $tmpdir) || die 544 "mgbuilder::build_index - couldn't read directory $tmpdir\n"; 545 foreach $file (readdir(DIR)) { 546 next if $file =~ /^\./; 547 my ($suffix) = $file =~ /\.([^\.]+)$/; 548 if (defined $suffix && !defined $wanted_index_files{$suffix}) { 549 # delete it! 
550 print STDERR "deleting $file\n" if $self->{'verbosity'} > 2; 551 &util::rm (&util::filename_cat ($tmpdir, $file)); 552 } 553 } 554 closedir (DIR); 555 } 520 556 } 521 557 … … 541 577 542 578 # set up the document processor 543 $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT'); 579 my ($handle); 580 if ($self->{'debug'}) { 581 $handle = STDOUT; 582 } else { 583 if (!-e "$txt2db_exe" || !open (PIPEOUT, "| $txt2db_exe $fulldbname")) { 584 die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n"; 585 } 586 $handle = mgbuilder::PIPEOUT; 587 } 588 589 $self->{'buildproc'}->set_output_handle ($handle); 544 590 $self->{'buildproc'}->set_mode ('infodb'); 545 591 $self->{'buildproc'}->set_dontgdbm ($self->{'dontgdbm'}); 546 592 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 547 593 $self->{'buildproc'}->set_indexing_text (0); 548 549 # create the infodatabase550 if (!-e "$txt2db_exe" || !open (PIPEOUT,551 "| $txt2db_exe $fulldbname")) {552 die "mgbuilder::make_infodatabase - couldn't run $txt2db_exe\n";553 }554 594 $self->{'buildproc'}->reset(); 555 595 556 596 if (defined $self->{'collect_cfg'}->{'collectionmeta'}) { 557 597 558 598 if (!defined $self->{'index_mapping'}) { 559 599 $self->{'index_mapping'} = … … 561 601 } 562 602 563 print PIPEOUT"[collection]\n";564 603 print $handle "[collection]\n"; 604 565 605 foreach $cmeta (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}})) { 566 606 if ($cmeta =~ s/^\.//) { 567 607 if (defined $self->{'index_mapping'}->{$cmeta}) { 568 print PIPEOUT"<$self->{'index_mapping'}->{$cmeta}>" .608 print $handle "<$self->{'index_mapping'}->{$cmeta}>" . 569 609 $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n"; 570 610 } else { … … 572 612 } 573 613 } else { 574 print PIPEOUT"<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n";614 print $handle "<$cmeta>$self->{'collect_cfg'}->{'collectionmeta'}->{$cmeta}\n"; 575 615 } 576 616 } 577 print PIPEOUT "\n" . ('-' x 70) . 
"\n"; 578 579 } 580 617 print $handle "\n" . ('-' x 70) . "\n"; 618 619 } 581 620 582 621 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, … … 584 623 585 624 # output classification information 586 &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT',625 &classify::output_classify_info ($self->{'classifiers'}, $handle, 587 626 $self->{'allclassifications'}); 588 627 589 close ( PIPEOUT);628 close ($handle) if !$self->{'debug'}; 590 629 } 591 630
Note: See TracChangeset for help on using the changeset viewer.