- Timestamp: 2001-05-29T10:49:04+12:00 (23 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgppbuilder.pm
r1917 r2478 59 59 'i'=>1, 60 60 'il'=>1, 61 'tw'=>1,62 61 'w'=>1, 63 62 'wa'=>1); … … 87 86 sub new { 88 87 my ($class, $collection, $source_dir, $build_dir, $verbosity, 89 $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_; 88 $maxdocs, $debug, $keepold, $allclassifications, 89 $outhandle, $no_text) = @_; 90 90 91 91 $outhandle = STDERR unless defined $outhandle; 92 92 $no_text = 0 unless defined $no_text; 93 93 94 # create an mgppbuilder object 94 95 my $self = bless {'collection'=>$collection, … … 101 102 'allclassifications'=>$allclassifications, 102 103 'outhandle'=>$outhandle, 104 'no_text'=>$no_text, 103 105 'notbuilt'=>[], # indexes not built 104 106 'indexfieldmap'=>\%static_indexfield_map … … 130 132 foreach $language (@{$self->{'collect_cfg'}->{'languages'}}) { 131 133 foreach $index (@$indexes) { 132 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language"); 134 if (defined ($self->{'collect_cfg'}->{'indexsubcollections'})) { 135 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index:$language"); 136 } 137 else { # add in an empty subcollection field 138 push (@{$self->{'collect_cfg'}->{'indexes'}}, "$index\:\:$language"); 139 } 133 140 } 134 141 } … … 233 240 my ($textindex) = @_; 234 241 235 my $exedir = "$ENV{'GSDLHOME'}/ src/mgpp/text";242 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; 236 243 my $exe = &util::get_os_exe (); 237 my $mg _passes_exe = &util::filename_cat($exedir, "mg_passes$exe");238 my $mg _compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe");244 my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe"); 245 my $mgpp_compression_dict_exe = &util::filename_cat($exedir, "mgpp_compression_dict$exe"); 239 246 my $outhandle = $self->{'outhandle'}; 240 247 … … 244 251 my $basefilename = "text/$self->{'collection'}"; 245 252 246 # mgpp cant work on windows at the moment 247 # if ($ENV{'GSDLOS'} =~ /^windows$/i) { 248 # $basefilename =~ s/\//\\/g; 249 # $builddir =~ s/\//\\/g; 
250 # 251 # } 253 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 254 $basefilename =~ s/\//\\/g; 255 $builddir =~ s/\//\\/g; 256 257 } 252 258 253 259 … … 255 261 # the compressor doesn't need to know about paragraphs - never want to 256 262 # retrieve them 257 my $mg _passes_sections = "";263 my $mgpp_passes_sections = ""; 258 264 if ($self->{'levels'}->{'Section'}) { 259 $mg _passes_sections .= "-K Section ";265 $mgpp_passes_sections .= "-K Section "; 260 266 } 261 267 … … 264 270 # collect the statistics for the text 265 271 # -b $maxdocsize sets the maximum document size to be 12 meg 266 print $outhandle "\n collecting text statistics (mg _passes -T1)\n" if ($self->{'verbosity'} >= 1);272 print $outhandle "\n collecting text statistics (mgpp_passes -T1)\n" if ($self->{'verbosity'} >= 1); 267 273 268 274 my ($handle); … … 270 276 $handle = STDOUT; 271 277 } else { 272 if (!-e "$mg _passes_exe" ||273 !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -T1")) {274 die "mgppbuilder::compress_text - couldn't run $mg _passes_exe\n";278 if (!-e "$mgpp_passes_exe" || 279 !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -d $builddir -f $basefilename -T1")) { 280 die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n"; 275 281 } 276 282 $handle = mgppbuilder::PIPEOUT; … … 281 287 $self->{'buildproc'}->set_index ($textindex); 282 288 $self->{'buildproc'}->set_indexing_text (0); 289 if ($self->{'no_text'}) { 290 $self->{'buildproc'}->set_store_text(0); 291 } else { 292 $self->{'buildproc'}->set_store_text(1); 293 } 283 294 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 284 295 $self->{'buildproc'}->set_levels ($self->{'levels'}); … … 293 304 close ($handle) unless $self->{'debug'}; 294 305 306 $self->print_stats(); 307 295 308 # create the compression dictionary 296 309 # the compression dictionary is built by assuming the stats are from a seed … … 301 314 if (!$self->{'debug'}) { 302 315 print $outhandle "\n 
creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); 303 if (!-e "$mg_compression_dict_exe") { 304 die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"; 305 } 306 system ("$mg_compression_dict_exe -d $builddir -f $basefilename -S -H -2 -k 5120"); 307 316 if (!-e "$mgpp_compression_dict_exe") { 317 die "mgppbuilder::compress_text - couldn't run $mgpp_compression_dict_exe\n"; 318 } 319 system ("mgpp_compression_dict$exe -d $builddir -f $basefilename -S -H -2 -k 5120"); 308 320 309 321 if (!$self->{'debug'}) { 310 if (!-e "$mg _passes_exe" ||311 !open ($handle, "| $mg_passes_exe $mg_passes_compress_sections -f $basefilename -d $builddir -T2")) {312 die "mgppbuilder::compress_text - couldn't run $mg _passes_exe\n";322 if (!-e "$mgpp_passes_exe" || 323 !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -f $basefilename -d $builddir -T2")) { 324 die "mgppbuilder::compress_text - couldn't run $mgpp_passes_exe\n"; 313 325 } 314 326 } … … 317 329 $self->{'buildproc'}->reset(); 318 330 # compress the text 319 print $outhandle "\n compressing the text (mg _passes -T2)\n" if ($self->{'verbosity'} >= 1);331 print $outhandle "\n compressing the text (mgpp_passes -T2)\n" if ($self->{'verbosity'} >= 1); 320 332 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 321 333 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); … … 408 420 } 409 421 422 $mapping{$index} = $dirname; 423 410 424 # store the mapping orders as well as the maps 425 # also put index, subcollection and language fields into the mapping thing - 426 # (the full index name (eg document:text:subcol:lang) is not used on 427 # the query page) -these are used for collectionmeta later on 411 428 if (!defined $mapping{'indexmap'}{"$fields"}) { 412 429 $mapping{'indexmap'}{"$fields"} = $pindex; 413 430 push (@{$mapping{'indexmaporder'}}, "$fields"); 431 if (!defined $mapping{"$fields"}) { 432 $mapping{"$fields"} = $pindex; 433 } 414 434 } 415 435 if ($psub =~ /\w/ && 
!defined ($mapping{'subcollectionmap'}{$subcollection})) { 416 436 $mapping{'subcollectionmap'}{$subcollection} = $psub; 417 437 push (@{$mapping{'subcollectionmaporder'}}, $subcollection); 438 $mapping{$subcollection} = $psub; 418 439 } 419 440 if ($plang =~ /\w/ && !defined ($mapping{'languagemap'}{$languages})) { 420 441 $mapping{'languagemap'}{$languages} = $plang; 421 442 push (@{$mapping{'languagemaporder'}}, $language); 422 }423 $mapping{$index} = $dirname;443 $mapping{$languages} = $plang; 444 } 424 445 $dirnames{$dirname} = $index; 425 446 $pnames{'index'}{$pindex} = "$fields"; … … 499 520 500 521 # get any os specific stuff 501 my $exedir = "$ENV{'GSDLHOME'}/ src/mgpp/text";522 my $exedir = "$ENV{'GSDLHOME'}/bin/$ENV{'GSDLOS'}"; 502 523 503 524 my $exe = &util::get_os_exe (); 504 my $mg _passes_exe = &util::filename_cat($exedir, "mg_passes$exe");525 my $mgpp_passes_exe = &util::filename_cat($exedir, "mgpp_passes$exe"); 505 526 506 527 # define the section names for mgpasses 507 my $mg _passes_sections = "";528 my $mgpp_passes_sections = ""; 508 529 foreach $level (keys (%{$self->{'levels'}})) { 509 530 if ($level eq "Section" || $level eq "Paragraph") { 510 $mg _passes_sections .= "-K $level ";511 } 512 } 513 514 my $mg _perf_hash_build_exe =515 &util::filename_cat($exedir, "mg _perf_hash_build$exe");516 my $mg _weights_build_exe =517 &util::filename_cat ($exedir, "mg _weights_build$exe");518 my $mg _invf_dict_exe =519 &util::filename_cat ($exedir, "mg _invf_dict$exe");520 my $mg _stem_idx_exe =521 &util::filename_cat ($exedir, "mg _stem_idx$exe");522 523 # if ($ENV{'GSDLOS'} =~ /^windows$/i) { 524 #$builddir=~ s/\//\\/g;525 #$basefilename =~ s/\//\\/g;526 #}531 $mgpp_passes_sections .= "-K $level "; 532 } 533 } 534 535 my $mgpp_perf_hash_build_exe = 536 &util::filename_cat($exedir, "mgpp_perf_hash_build$exe"); 537 my $mgpp_weights_build_exe = 538 &util::filename_cat ($exedir, "mgpp_weights_build$exe"); 539 my $mgpp_invf_dict_exe = 540 &util::filename_cat 
($exedir, "mgpp_invf_dict$exe"); 541 my $mgpp_stem_idx_exe = 542 &util::filename_cat ($exedir, "mgpp_stem_idx$exe"); 543 544 if ($ENV{'GSDLOS'} =~ /^windows$/i) { 545 $builddir=~ s/\//\\/g; 546 $basefilename =~ s/\//\\/g; 547 } 527 548 528 549 # get the index expression if this index belongs 529 550 # to a subcollection 530 551 my $indexexparr = []; 531 my ($fields, $subcollection) = split (":", $index); 552 553 # there may be subcollection info, and language info. 554 my ($fields, $subcollection, $language) = split (":", $index); 532 555 my @subcollections = (); 533 556 @subcollections = split /,/, $subcollection if (defined $subcollection); … … 540 563 541 564 # add expressions for languages if this index belongs to 542 # a language subcollection 543 foreach $language (@{$self->{'collect_cfg'}->{'languages'}}) { 565 # a language subcollection - only put languages expressions for the 566 # ones we want in the index 567 568 my @languages = (); 569 @languages = split /,/, $language if (defined $language); 570 foreach $language (@languages) { 571 my $not=0; 544 572 if ($language =~ s/^\!//) { 545 push (@$indexexparr, "!Language/$language/"); 546 } else { 547 push (@$indexexparr, "Language/$language/"); 573 $not = 1; 574 } 575 foreach $lang (@{$self->{'collect_cfg'}->{'languages'}}) { 576 if ($lang eq $language) { 577 if ($not) { 578 push (@$indexexparr, "!Language/$language/"); 579 } else { 580 push (@$indexexparr, "Language/$language/"); 581 } 582 last; 583 } 548 584 } 549 585 } 550 586 551 587 # Build index dictionary. 
Uses verbatim stem method 552 print $outhandle "\n creating index dictionary (mg _passes -I1)\n" if ($self->{'verbosity'} >= 1);588 print $outhandle "\n creating index dictionary (mgpp_passes -I1)\n" if ($self->{'verbosity'} >= 1); 553 589 my ($handle); 554 590 if ($self->{'debug'}) { 555 591 $handle = STDOUT; 556 592 } else { 557 if (!-e "$mg _passes_exe" ||558 !open (PIPEOUT, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I1")) {559 die "mgppbuilder::build_index - couldn't run $mg _passes_exe\n";593 if (!-e "$mgpp_passes_exe" || 594 !open (PIPEOUT, "| mgpp_passes$exe $mgpp_passes_sections -d $builddir -f $basefilename -I1")) { 595 die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n"; 560 596 } 561 597 $handle = mgppbuilder::PIPEOUT; … … 567 603 $self->{'buildproc'}->set_index ($index, $indexexparr); 568 604 $self->{'buildproc'}->set_indexing_text (1); 605 $self->{'buildproc'}->set_store_text(1); 569 606 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 570 607 $self->{'buildproc'}->set_levels ($self->{'levels'}); … … 578 615 if (!$self->{'debug'}) { 579 616 # create the perfect hash function 580 if (!-e "$mg _perf_hash_build_exe") {581 die "mgppbuilder::build_index - couldn't run $mg _perf_hash_build_exe\n";582 } 583 system (" $mg_perf_hash_build_exe -d $builddir -f $basefilename");584 585 if (!-e "$mg _passes_exe" ||586 !open ($handle, "| $mg_passes_exe $mg_passes_sections -d $builddir -f $basefilename -I2")) {587 die "mgppbuilder::build_index - couldn't run $mg _passes_exe\n";617 if (!-e "$mgpp_perf_hash_build_exe") { 618 die "mgppbuilder::build_index - couldn't run $mgpp_perf_hash_build_exe\n"; 619 } 620 system ("mgpp_perf_hash_build$exe -d $builddir -f $basefilename"); 621 622 if (!-e "$mgpp_passes_exe" || 623 !open ($handle, "| mgpp_passes$exe $mgpp_passes_sections -d $builddir -f $basefilename -I2")) { 624 die "mgppbuilder::build_index - couldn't run $mgpp_passes_exe\n"; 588 625 } 589 626 } 590 627 591 628 
# invert the text 592 print $outhandle "\n inverting the text (mg _passes -I2)\n" if ($self->{'verbosity'} >= 1);629 print $outhandle "\n inverting the text (mgpp_passes -I2)\n" if ($self->{'verbosity'} >= 1); 593 630 594 631 $self->{'buildproc'}->reset(); … … 604 641 # create the weights file 605 642 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1); 606 if (!-e "$mg _weights_build_exe") {607 die "mgppbuilder::build_index - couldn't run $mg _weights_build_exe\n";608 } 609 system (" $mg_weights_build_exe -d $builddir -f $basefilename");643 if (!-e "$mgpp_weights_build_exe") { 644 die "mgppbuilder::build_index - couldn't run $mgpp_weights_build_exe\n"; 645 } 646 system ("mgpp_weights_build$exe -d $builddir -f $basefilename"); 610 647 611 648 # create 'on-disk' stemmed dictionary 612 649 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1); 613 if (!-e "$mg _invf_dict_exe") {614 die "mgppbuilder::build_index - couldn't run $mg _invf_dict_exe\n";615 } 616 system (" $mg_invf_dict_exe -d $builddir -f $basefilename");650 if (!-e "$mgpp_invf_dict_exe") { 651 die "mgppbuilder::build_index - couldn't run $mgpp_invf_dict_exe\n"; 652 } 653 system ("mgpp_invf_dict$exe -d $builddir -f $basefilename"); 617 654 618 655 619 656 # creates stem index files for the various stemming methods 620 657 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1); 621 if (!-e "$mg _stem_idx_exe") {622 die "mgppbuilder::build_index - couldn't run $mg _stem_idx_exe\n";623 } 624 system (" $mg_stem_idx_exe -b 4096 -s1 -d $builddir -f $basefilename");625 system (" $mg_stem_idx_exe -b 4096 -s2 -d $builddir -f $basefilename");626 system (" $mg_stem_idx_exe -b 4096 -s3 -d $builddir -f $basefilename");658 if (!-e "$mgpp_stem_idx_exe") { 659 die "mgppbuilder::build_index - couldn't run $mgpp_stem_idx_exe\n"; 660 } 661 system ("mgpp_stem_idx$exe -b 4096 -s1 -d $builddir -f $basefilename"); 662 system 
("mgpp_stem_idx$exe -b 4096 -s2 -d $builddir -f $basefilename"); 663 system ("mgpp_stem_idx$exe -b 4096 -s3 -d $builddir -f $basefilename"); 627 664 628 665 … … 664 701 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe"); 665 702 703 # define the indexed field mapping if not already done so (ie if infodb called separately from build_index) 704 if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) { 705 #check build.cfg to see if indexfields have been filled in 706 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "building/build.cfg"); 707 if (-e $buildconfigfile) { 708 $buildcfg = &colcfg::read_build_cfg( $buildconfigfile); 709 if (defined $buildcfg->{'indexfields'}) { 710 foreach $field (@{$buildcfg->{'indexfields'}}) { 711 $self->{'buildproc'}->{'indexfields'}->{$field} = 1; 712 } 713 } 714 if (defined $buildcfg->{'indexfieldmap'}) { 715 foreach $field (@{$buildcfg->{'indexfieldmap'}}) { 716 ($f, $v) = $field =~ /^(.*)\-\>(.*)$/; 717 $self->{'buildproc'}->{'indexfieldmap'}->{$f} = $v; 718 } 719 } 720 } 721 } 722 666 723 print $outhandle "\n*** creating the info database and processing associated files\n" 667 724 if ($self->{'verbosity'} >= 1); … … 687 744 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 688 745 $self->{'buildproc'}->set_indexing_text (0); 689 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 746 $self->{'buildproc'}->set_store_text(1); 747 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 690 748 691 749 $self->{'buildproc'}->reset(); … … 714 772 } 715 773 #print out the indexfield mapping 716 foreach $field (keys (%{$self->{'indexfieldmap'}})) {717 $shortname = $self->{' indexfieldmap'}->{$field};718 print $handle "<$shortname>$field\n" ;774 foreach $field (keys %{$self->{'buildproc'}->{'indexfields'}}) { 775 $shortname = $self->{'buildproc'}->{'indexfieldmap'}->{$field}; 776 print $handle "<$shortname>$field\n" if defined $shortname; 719 777 } 720 778 print 
$handle "\n" . ('-' x 70) . "\n"; … … 820 878 print $outhandle "Total bytes in $index: $num_processed_bytes\n"; 821 879 822 if ($num_processed_bytes < 50 ) {880 if ($num_processed_bytes < 50 && ($indexing_text || !$self->{'no_text'})) { 823 881 print $outhandle "***************\n"; 824 print $outhandle "WARNING: There is very little or no text to process for $index\n";825 882 if ($indexing_text) { 826 print $outhandle "This may cause an error while attempting to build the index\n"; 827 } else { 828 print $outhandle "This may cause an error while attempting to compress the text\n"; 829 } 883 print $outhandle "WARNING: There is very little or no text to process for $index\n"; 884 } elsif (!$self->{'no_text'}) { 885 print $outhandle "WARNING: There is very little or no text to compress\n"; 886 } 887 print $outhandle " Was this your intention?\n"; 830 888 print $outhandle "***************\n"; 831 889 }
Note: See TracChangeset for help on using the changeset viewer.