Changeset 25155 for gs2-extensions
- Timestamp:
- 2012-02-28T16:40:01+13:00 (12 years ago)
- Location:
- gs2-extensions/ngramj/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
gs2-extensions/ngramj/perllib/ngramj.pm
r25141 r25155 33 33 34 34 use strict; 35 35 no strict 'refs'; # allow filehandles to be variables and viceversa 36 36 37 37 sub new { 38 38 my $class = shift (@_); 39 40 my $self = {}; 39 my ($verbosity,$outhandle) = @_; 40 41 my $self = { 'verbosity' => $verbosity, 'outhandle' => $outhandle }; 41 42 42 43 my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'},"ext","ngramj","jars","cngram.jar"); … … 45 46 46 47 $self->{'java_cmd'} = $java_cmd; 47 48 48 49 49 return bless $self, $class; … … 87 87 my @lang_array = split(/\s+/,$lang_group); 88 88 89 my @lang_summary = ( "++ Ngram language probabilities:\n++ "); 90 89 91 foreach my $l (@lang_array) { 90 print STDERR "l = $l\n"; 92 push(@lang_summary,$l); 91 93 my ($lang,$score) = ($l =~ m/^(.+):(.+)$/); 92 94 … … 95 97 96 98 push(@$lang_encode_pairs,$lang_pair); 99 } 100 push(@lang_summary,"\n"); 101 102 if ($self->{'verbosity'}>=2) { 103 my $outhandle = $self->{'outhandle'}; 104 my $lang_summary_str = join(" ",@lang_summary); 105 print $outhandle $lang_summary_str; 97 106 } 98 107 } … … 105 114 return undef; 106 115 } 107 108 109 116 110 117 &util::rm($tmp_txt_filename); … … 119 126 120 127 return $self->classify_contents($contents_ref,$filename,$filter_by_encoding); 121 122 128 } 123 129 -
gs2-extensions/ngramj/perllib/plugins/ReadTextFile.pm
r25141 r25155 120 120 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_; 121 121 122 print STDERR "**** In Pei Jones Local version of ReadTextFile\n"; 122 my $verbosity = $self->{'verbosity'}; 123 if ($verbosity>=2) { 124 my $outhandle = $self->{'outhandle'}; 125 print $outhandle "++ Using Ngram-Java version of ReadTextFile.pm ++\n"; 126 } 123 127 124 128 my $outhandle = $self->{'outhandle'}; … … 329 333 my ($filename) = @_; 330 334 331 $self->{'textcat'} = new ngramj() if (!defined($self->{'textcat'})); 335 $self->{'textcat'} = new ngramj($self->{'verbosity'},$self->{'outhandle'}) if (!defined($self->{'textcat'})); 332 336 333 337 my ($language, $encoding, $extracted_encoding); … … 629 633 630 634 else { # need to use textcat to get either the language, or get both language and encoding 631 $self->{'ngramj'} = new ngramj() if (!defined($self->{'ngramj'})); 635 $self->{'ngramj'} = new ngramj($self->{'verbosity'},$self->{'outhandle'}) if (!defined($self->{'ngramj'})); 632 636 633 637 if($found_html_encoding) { # know encoding, find language by limiting search to known encoding
Note: See TracChangeset for help on using the changeset viewer.