Changeset 1694
- Timestamp: 2000-11-23T11:55:50+13:00 (23 years ago)
- Location: trunk/gsdl/perllib
- Files: 2 edited
Legend:
- Unmodified
- Added
- Removed
trunk/gsdl/perllib/mgppbuilder.pm
r1301 r1694 31 31 use plugin; 32 32 use util; 33 use FileHandle; 34 35 36 BEGIN { 37 # set autoflush on for STDERR and STDOUT so that mg 38 # doesn't get out of sync with plugins 39 STDOUT->autoflush(1); 40 STDERR->autoflush(1); 41 } 42 43 END { 44 STDOUT->autoflush(0); 45 STDERR->autoflush(0); 46 } 47 48 $maxdocsize = 12000; 33 49 34 50 #update this !!!!!!!!!!!!!!!! … … 47 63 sub new { 48 64 my ($class, $collection, $source_dir, $build_dir, $verbosity, 49 $maxdocs, $debug, $keepold, $allclassifications) = @_; 65 $maxdocs, $debug, $keepold, $allclassifications, $outhandle) = @_; 66 67 $outhandle = STDERR unless defined $outhandle; 50 68 51 69 # create an mgppbuilder object … … 58 76 'keepold'=>$keepold, 59 77 'allclassifications'=>$allclassifications, 78 'outhandle'=>$outhandle, 60 79 'notbuilt'=>[] # indexes not built 61 80 }, $class; … … 100 119 $self->{'pluginfo'} = &plugin::load_plugins ($plugins); 101 120 if (scalar(@{$self->{'pluginfo'}}) == 0) { 102 print STDERR"No plugins were loaded.\n";121 print $outhandle "No plugins were loaded.\n"; 103 122 die "\n"; 104 123 } … … 111 130 112 131 # load all the classifiers 113 $self->{'classifiers'} = &classify::load_classifiers ($classifiers );132 $self->{'classifiers'} = &classify::load_classifiers ($classifiers, $outhandle); 114 133 115 134 # load up any dontgdbm fields … … 135 154 136 155 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " . 
137 "\$source_dir, \$build_dir, \$verbosity )");156 "\$source_dir, \$build_dir, \$verbosity, \$outhandle)"); 138 157 die "$@" if $@; 139 158 … … 160 179 my ($textindex, $indexname) = @_; 161 180 162 print STDERR "build_col, textindex=$textindex, indexname=$indexname\n"; 181 my $outhandle = $self->{'outhandle'}; 182 183 print $outhandle "build_col, textindex=$textindex, indexname=$indexname\n"; 163 184 my $exedir = "$ENV{'GSDLHOME'}/src/mgpp/text"; 164 185 my $exe = &util::get_os_exe (); … … 203 224 push @$indexes, $indexname; 204 225 } 205 print STDERR"indexes are: @$indexes\n";206 207 208 print STDERR"\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1);209 print STDERR"fulltextprefix=$fulltextprefix\n";226 print $outhandle "indexes are: @$indexes\n"; 227 228 229 print $outhandle "\n*** mg_passes: first pass\n" if ($self->{'verbosity'} >= 1); 230 print $outhandle "fulltextprefix=$fulltextprefix\n"; 210 231 # carry out the first pass of mg_passes 211 232 # -b $maxdocsize sets the maximum document size to be 12 meg - not available any longer 212 print STDERR"\n collecting text statistics\n" if ($self->{'verbosity'} >= 1);233 print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1); 213 234 214 235 my ($handle); … … 247 268 # words being put into the dictionary first (-2 -k 5120) 248 269 if (!$self->{'debug'}) { 249 print STDERR"\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);270 print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); 250 271 if (!-e "$mg_compression_dict_exe") { 251 272 die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"; … … 271 292 $self->{'buildproc'}->reset(); 272 293 273 print STDERR"\n compressing the text\n" if ($self->{'verbosity'} >= 1);294 print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1); 274 295 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 275 296 "", {}, $self->{'buildproc'}, 
$self->{'maxdocs'}); … … 279 300 280 301 # create the weights file 281 print STDERR"\n create the weights file\n" if ($self->{'verbosity'} >= 1);302 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1); 282 303 if (!-e "$mg_weights_build_exe") { 283 304 die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n"; … … 286 307 287 308 # create 'on-disk' stemmed dictionary 288 print STDERR"\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);309 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1); 289 310 if (!-e "$mg_invf_dict_exe") { 290 311 die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n"; … … 294 315 295 316 # creates stem index files for the various stemming methods 296 print STDERR"\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);317 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1); 297 318 if (!-e "$mg_stem_idx_exe") { 298 319 die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n"; … … 315 336 my $mg_passes_exe = &util::filename_cat($exedir, "mg_passes$exe"); 316 337 my $mg_compression_dict_exe = &util::filename_cat($exedir, "mg_compression_dict$exe"); 338 my $outhandle = $self->{'outhandle'}; 317 339 318 340 &util::mk_all_dir (&util::filename_cat($self->{'build_dir'}, "text")); … … 327 349 } 328 350 329 print STDERR"\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1);351 print $outhandle "\n*** creating the compressed text\n" if ($self->{'verbosity'} >= 1); 330 352 331 353 # collect the statistics for the text 332 print STDERR "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1); 354 # -b $maxdocsize sets the maximum document size to be 12 meg 355 print $outhandle "\n collecting text statistics\n" if ($self->{'verbosity'} >= 1); 333 356 334 357 my ($handle); … … 364 387 # note: this options are left over from mg version 365 388 if (!$self->{'debug'}) { 366 print 
STDERR"\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1);389 print $outhandle "\n creating the compression dictionary\n" if ($self->{'verbosity'} >= 1); 367 390 if (!-e "$mg_compression_dict_exe") { 368 391 die "mgppbuilder::compress_text - couldn't run $mg_compression_dict_exe\n"; … … 381 404 $self->{'buildproc'}->reset(); 382 405 # compress the text 383 print STDERR"\n compressing the text\n" if ($self->{'verbosity'} >= 1);406 print $outhandle "\n compressing the text\n" if ($self->{'verbosity'} >= 1); 384 407 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 385 408 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 386 409 close ($handle) unless $self->{'debug'}; 410 411 $self->print_stats(); 387 412 } 388 413 … … 406 431 my $self = shift (@_); 407 432 my ($indexname) = @_; 433 my $outhandle = $self->{'outhandle'}; 408 434 409 435 my $indexes = []; … … 416 442 # push @$indexes, "text,Title,Organization,Magazine,Subject"; 417 443 # push @$indexes, "Title,Organization,Magazine,Subject"; 444 418 445 # create the mapping between the index descriptions 419 446 # and their directory names … … 423 450 foreach $index (@$indexes) { 424 451 if ($self->want_built($index)) { 425 print STDERR"\n*** building index $index in subdirectory " .452 print $outhandle "\n*** building index $index in subdirectory " . 426 453 "$self->{'index_mapping'}->{$index}\n" if ($self->{'verbosity'} >= 1); 427 454 $self->build_index($index); 428 455 } else { 429 print STDERR"\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1);456 print $outhandle "\n*** ignoring index $index\n" if ($self->{'verbosity'} >= 1); 430 457 } 431 458 } … … 551 578 my $self = shift (@_); 552 579 my ($index) = @_; 580 my $outhandle = $self->{'outhandle'}; 553 581 554 582 # get the full index directory path and make sure it exists … … 603 631 604 632 # Build index dictionary. 
Uses verbatim stem method 605 print STDERR"\n creating index dictionary\n" if ($self->{'verbosity'} >= 1);633 print $outhandle "\n creating index dictionary\n" if ($self->{'verbosity'} >= 1); 606 634 my ($handle); 607 635 if ($self->{'debug'}) { … … 626 654 close ($handle) unless $self->{'debug'}; 627 655 656 $self->print_stats(); 657 628 658 if (!$self->{'debug'}) { 629 659 # create the perfect hash function … … 640 670 641 671 # invert the text 642 print STDERR"\n inverting the text\n" if ($self->{'verbosity'} >= 1);672 print $outhandle "\n inverting the text\n" if ($self->{'verbosity'} >= 1); 643 673 644 674 $self->{'buildproc'}->reset(); 645 675 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 646 676 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 677 678 $self->print_stats (); 647 679 648 680 if (!$self->{'debug'}) { … … 651 683 652 684 # create the weights file 653 print STDERR"\n create the weights file\n" if ($self->{'verbosity'} >= 1);685 print $outhandle "\n create the weights file\n" if ($self->{'verbosity'} >= 1); 654 686 if (!-e "$mg_weights_build_exe") { 655 687 die "mgppbuilder::build_index - couldn't run $mg_weights_build_exe\n"; … … 658 690 659 691 # create 'on-disk' stemmed dictionary 660 print STDERR"\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1);692 print $outhandle "\n creating 'on-disk' stemmed dictionary\n" if ($self->{'verbosity'} >= 1); 661 693 if (!-e "$mg_invf_dict_exe") { 662 694 die "mgppbuilder::build_index - couldn't run $mg_invf_dict_exe\n"; … … 666 698 667 699 # creates stem index files for the various stemming methods 668 print STDERR"\n creating stem indexes\n" if ($self->{'verbosity'} >= 1);700 print $outhandle "\n creating stem indexes\n" if ($self->{'verbosity'} >= 1); 669 701 if (!-e "$mg_stem_idx_exe") { 670 702 die "mgppbuilder::build_index - couldn't run $mg_stem_idx_exe\n"; … … 684 716 # if (defined $suffix && !defined $wanted_index_files{$suffix}) { 685 717 # delete it! 
686 # print STDERR"deleting $file\n" if $self->{'verbosity'} > 2;718 # print $outhandle "deleting $file\n" if $self->{'verbosity'} > 2; 687 719 # &util::rm (&util::filename_cat ($tmpdir, $file)); 688 720 # } … … 694 726 sub make_infodatabase { 695 727 my $self = shift (@_); 728 my $outhandle = $self->{'outhandle'}; 729 730 696 731 my $textdir = &util::filename_cat($self->{'build_dir'}, "text"); 697 732 my $assocdir = &util::filename_cat($self->{'build_dir'}, "assoc"); … … 709 744 my $txt2db_exe = &util::filename_cat($exedir, "txt2db$exe"); 710 745 711 print STDERR"\n*** creating the info database and processing associated files\n"746 print $outhandle "\n*** creating the info database and processing associated files\n" 712 747 if ($self->{'verbosity'} >= 1); 713 748 … … 748 783 print $handle "<$self->{'index_mapping'}->{$cmeta}>" . 749 784 $self->{'collect_cfg'}->{'collectionmeta'}->{".$cmeta"} . "\n"; 750 print STDERR"have .section entry in collect file\n";785 print $outhandle "have .section entry in collect file\n"; 751 786 } else { 752 print STDERR"mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n";787 print $outhandle "mgppbuilder: warning bad collectionmeta option '$cmeta' - ignored\n"; 753 788 } 754 789 } else { … … 779 814 my %build_cfg = (); 780 815 781 print STDERR "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1); 816 my $outhandle = $self->{'outhandle'}; 817 print $outhandle "\n*** creating auxiliary files \n" if ($self->{'verbosity'} >= 1); 782 818 783 819 # get the text directory … … 826 862 } 827 863 864 sub print_stats { 865 my $self = shift (@_); 866 867 my $outhandle = $self->{'outhandle'}; 868 my $indexing_text = $self->{'buildproc'}->get_indexing_text(); 869 my $index = $self->{'buildproc'}->get_index(); 870 my $num_bytes = $self->{'buildproc'}->get_num_bytes(); 871 my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes(); 872 873 if ($indexing_text) { 874 print $outhandle "Stats (Creating index 
$index)\n"; 875 } else { 876 print $outhandle "Stats (Compressing text from $index)\n"; 877 } 878 print $outhandle "Total bytes in collection: $num_bytes\n"; 879 print $outhandle "Total bytes in $index: $num_processed_bytes\n"; 880 881 if ($num_processed_bytes < 50) { 882 print $outhandle "***************\n"; 883 print $outhandle "WARNING: There is very little or no text to process for $index\n"; 884 if ($indexing_text) { 885 print $outhandle "This may cause an error while attempting to build the index\n"; 886 } else { 887 print $outhandle "This may cause an error while attempting to compress the text\n"; 888 } 889 print $outhandle "***************\n"; 890 } 891 892 } 828 893 829 894 1; -
trunk/gsdl/perllib/mgppbuildproc.pm
r932 r1694 42 42 43 43 sub new { 44 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_; 44 my ($class, $collection, $source_dir, $build_dir, 45 $verbosity, $outhandle) = @_; 45 46 my $self = new docproc (); 47 48 # outhandle is where all the debugging info goes 49 # output_handle is where the output of the plugins is piped 50 # to (i.e. mg, gdbm etc.) 51 $outhandle = STDERR unless defined $outhandle; 46 52 47 53 $self->{'collection'} = $collection; … … 59 65 $self->{'num_sections'} = 0; 60 66 $self->{'num_bytes'} = 0; 67 $self->{'num_processed_bytes'} = 0; 68 $self->{'outhandle'} = $outhandle; 61 69 62 70 $self->{'indexing_text'} = 0; … … 70 78 $self->{'num_docs'} = 0; 71 79 $self->{'num_sections'} = 0; 80 $self->{'num_processed_bytes'} = 0; 72 81 $self->{'num_bytes'} = 0; 73 82 } … … 85 94 } 86 95 96 # num_bytes is the actual number of bytes in the collection 97 # this is normally the same as what's processed during text compression 87 98 sub get_num_bytes { 88 99 my $self = shift (@_); 89 100 90 101 return $self->{'num_bytes'}; 102 } 103 104 # num_processed_bytes is the number of bytes actually passed 105 # to mgpp for the current index 106 sub get_num_processed_bytes { 107 my $self = shift (@_); 108 109 return $self->{'num_processed_bytes'}; 91 110 } 92 111 … … 127 146 } 128 147 148 sub get_index { 149 my $self = shift (@_); 150 151 return $self->{'index'}; 152 } 153 129 154 sub set_classifiers { 130 155 my $self = shift (@_); … … 139 164 140 165 $self->{'indexing_text'} = $indexing_text; 166 } 167 168 sub get_indexing_text { 169 my $self = shift (@_); 170 171 return $self->{'indexing_text'}; 141 172 } 142 173 … … 240 271 else { print $handle "[$doc_OID.$section]\n"; } 241 272 242 # output the fact that this document is a document 243 #print $handle "<doctype>doc\n"; 273 # output the fact that this document is a document (unless doctype 274 # has been set to something else from within a plugin 275 my $dtype = $doc_obj->get_metadata_element 
($section, "doctype"); 276 if (!defined $dtype || $dtype !~ /\w/) { 277 print $handle "<doctype>doc\n"; 278 } 244 279 245 280 # output whether this node contains text … … 251 286 252 287 # output all the section metadata 253 my $found_doctype = 0;288 #my $found_doctype = 0; 254 289 my $metadata = $doc_obj->get_all_metadata ($section); 255 290 foreach $pair (@$metadata) { 256 291 my ($field, $value) = (@$pair); 257 292 258 $found_doctype = 1 if $field eq "doctype";293 #$found_doctype = 1 if $field eq "doctype"; 259 294 if ($field ne "Identifier" && $field !~ /^gsdl/ && 260 295 defined $value && $value ne "") { … … 282 317 # (unless doctype was already output as part of 283 318 # metadata) 284 if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {285 print $handle "<doctype>doc\n";286 }319 #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) { 320 # print $handle "<doctype>doc\n"; 321 #} 287 322 288 323
Note:
See TracChangeset
for help on using the changeset viewer.