Changeset 1251
- Timestamp:
- 2000-06-29T10:34:25+12:00 (24 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgbuilder.pm
r1246  r1251
 207    207      close ($handle) unless $self->{'debug'};
 208    208
        209      $self->print_stats();
        210
 209    211      # create the compression dictionary
 210    212      # the compression dictionary is built by assuming the stats are from a seed
   …      …
 232    234                  "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
 233    235      close ($handle) unless $self->{'debug'};
        236
        237      $self->print_stats();
 234    238  }
 235    239
   …      …
 481    485      close ($handle) unless $self->{'debug'};
 482    486
        487      $self->print_stats();
        488
 483    489      if (!$self->{'debug'}) {
 484    490          # create the perfect hash function
   …      …
 502    508                  "", {}, $self->{'buildproc'}, $self->{'maxdocs'});
 503    509
        510      $self->print_stats ();
        511
 504    512      if (!$self->{'debug'}) {
 505    513
   …      …
 680    688  }
 681    689
        690  sub print_stats {
        691      my $self = shift (@_);
        692
        693      my $indexing_text = $self->{'buildproc'}->get_indexing_text();
        694      my $index = $self->{'buildproc'}->get_index();
        695      my $num_bytes = $self->{'buildproc'}->get_num_bytes();
        696      my $num_processed_bytes = $self->{'buildproc'}->get_num_processed_bytes();
        697
        698      if ($indexing_text) {
        699          print STDERR "Stats (Creating index $index)\n";
        700      } else {
        701          print STDERR "Stats (Compressing text from $index)\n";
        702      }
        703      print STDERR "Total bytes in collection: $num_bytes\n";
        704      print STDERR "Total bytes in $index: $num_processed_bytes\n";
        705
        706      if ($num_processed_bytes < 50) {
        707          print STDERR "***************\n";
        708          print STDERR "WARNING: There is very little or no text to process for $index\n";
        709          if ($indexing_text) {
        710              print STDERR "This may cause an error while attempting to build the index\n";
        711          } else {
        712              print STDERR "This may cause an error while attempting to compress the text\n";
        713          }
        714          print STDERR "***************\n";
        715      }
        716  }
 682    717
 683    718  1;
trunk/gsdl/perllib/mgbuildproc.pm
r1072  r1251
  58     58      $self->{'num_sections'} = 0;
  59     59      $self->{'num_bytes'} = 0;
         60      $self->{'num_processed_bytes'} = 0;
  60     61
  61     62      $self->{'indexing_text'} = 0;
   …      …
  69     70      $self->{'num_docs'} = 0;
  70     71      $self->{'num_sections'} = 0;
         72      $self->{'num_processed_bytes'} = 0;
  71     73      $self->{'num_bytes'} = 0;
  72     74  }
   …      …
  84     86  }
  85     87
         88  # num_bytes is the actual number of bytes in the collection
         89  # this is normally the same as what's processed during text compression
  86     90  sub get_num_bytes {
  87     91      my $self = shift (@_);
  88     92
  89     93      return $self->{'num_bytes'};
         94  }
         95
         96  # num_processed_bytes is the number of bytes actually passed
         97  # to mg for the current index
         98  sub get_num_processed_bytes {
         99      my $self = shift (@_);
        100
        101      return $self->{'num_processed_bytes'};
  90    102  }
  91    103
   …      …
 126    138  }
 127    139
        140  sub get_index {
        141      my $self = shift (@_);
        142
        143      return $self->{'index'};
        144  }
        145
 128    146  sub set_classifiers {
 129    147      my $self = shift (@_);
   …      …
 138    156
 139    157      $self->{'indexing_text'} = $indexing_text;
        158  }
        159
        160  sub get_indexing_text {
        161      my $self = shift (@_);
        162
        163      return $self->{'indexing_text'};
 140    164  }
 141    165
   …      …
 416    440          if ($real_field eq "text") {
 417    441              $new_text = $doc_obj->get_text ($section);
        442              $self->{'num_processed_bytes'} += length ($new_text);
 418    443              $new_text =~ s/[\cB\cC]//g;
 419    444              $self->find_paragraphs($new_text);
   …      …
 423    448              foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
 424    449                  $meta =~ s/[\cB\cC]//g;
        450                  $self->{'num_processed_bytes'} += length ($meta);
 425    451                  $new_text .= "\cC" unless $first;
 426    452                  $new_text .= $meta;
Note:
See TracChangeset
for help on using the changeset viewer.