Changeset 2235
- Timestamp:
- 2001-04-01T13:04:26+12:00 (23 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/plugins/BasPlug.pm
r2219 r2235 139 139 $self->{'outhandle'} = STDERR; 140 140 my $year = (localtime)[5]+1900; 141 142 141 142 $self->{'textcat'} = new textcat(); 143 143 144 # general options available to all plugins 144 145 if (!parsargv::parse(\@_, … … 386 387 387 388 # get the language/encoding 388 my @results = textcat::classify($text);389 my $results = $self->{'textcat'}->classify(\$text); 389 390 390 391 # if textcat returns 3 or less possibilities we'll use the 391 392 # first one in the list - otherwise use the defaults 392 if (scalar @ results > 3) {393 if (scalar @$results > 3) { 393 394 394 395 if ($self->{'input_encoding'} ne 'auto') { … … 409 410 410 411 # format language/encoding 411 my ($language, $encoding) = $results [0] =~ /^([^-]*)(?:-(.*))?$/;412 my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/; 412 413 if (!defined $language) { 413 414 if ($self->{'verbosity'}) { -
trunk/gsdl/perllib/textcat.pm
r1316 r2235 32 32 package textcat; 33 33 34 use strict;35 #use Benchmark;36 37 34 # OPTIONS 38 35 my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat"; … … 44 41 my $non_word_characters = '0-9\s'; 45 42 43 sub new { 44 my $class = shift (@_); 45 my $self = {}; 46 47 # open directory to find which languages are supported 48 opendir DIR, "$model_dir" or die "directory $model_dir: $!\n"; 49 my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR)); 50 closedir DIR; 51 @languages or die "sorry, can't read any language models from $model_dir\n" . 52 "language models must reside in files with .lm ending\n"; 53 54 # load model and count for each language. 55 foreach my $language (@languages) { 56 my %ngram=(); 57 my $rang=1; 58 open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n"; 59 while (<LM>) { 60 chomp; 61 # only use lines starting with appropriate character. Others are ignored. 62 if (/^[^$non_word_characters]+/o) { 63 $self->{'ngrams'}->{$language}->{$&} = $rang++; 64 } 65 } 66 close(LM); 67 } 68 69 $self->{'languages'} = \@languages; 70 return bless $self, $class; 71 } 72 73 46 74 47 75 # CLASSIFICATION … … 52 80 53 81 sub classify { 54 my ($input)=@_;55 my %results=();56 my $maxp = $opt_t;82 my ($self, $inputref)=@_; 83 my %results = (); 84 my $maxp = $opt_t; 57 85 58 # open directory to find which languages are supported 59 opendir DIR, "$model_dir" or die "directory $model_dir: $!\n"; 60 my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR)); 61 closedir DIR; 62 @languages or die "sorry, can't read any language models from $model_dir\n" . 63 "language models must reside in files with .lm ending\n"; 86 # create ngrams for input. 87 my $unknown = $self->create_lm($inputref); 64 88 65 # create ngrams for input. Note that hash %unknown is not used; 66 # it contains the actual counts which are only used under -n: creating 67 # new language model (and even then they are not really required). 68 my @unknown=create_lm($input); 69 # load model and count for each language. 70 my $language; 71 # my $t1 = new Benchmark; 72 foreach $language (@languages) { 73 # loads the language model into hash %$language. 74 my %ngram=(); 75 my $rang=1; 76 open(LM,"$model_dir/$language.lm") || die "cannot open $language.lm: $!\n"; 77 while (<LM>) { 78 chomp; 79 # only use lines starting with appropriate character. Others are 80 # ignored. 81 if (/^[^$non_word_characters]+/o) { 82 $ngram{$&} = $rang++; 83 } 89 foreach my $language (@{$self->{'languages'}}) { 90 91 # compare language model with input ngrams list 92 my ($i,$p)=(0,0); 93 while ($i < scalar (@$unknown)) { 94 if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) { 95 $p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i); 96 } else { 97 $p=$p+$maxp; 98 } 99 ++$i; 100 } 101 $results{$language} = $p; 84 102 } 85 close(LM); 86 #print STDERR "loaded language model $language\n" if $opt_v; 87 88 # compares the language model with input ngrams list 89 my ($i,$p)=(0,0); 90 while ($i < @unknown) { 91 if ($ngram{$unknown[$i]}) { 92 $p=$p+abs($ngram{$unknown[$i]}-$i); 93 } else { 94 $p=$p+$maxp; 95 } 96 ++$i; 103 104 my @results = sort { $results{$a} <=> $results{$b} } keys %results; 105 my $a = $results{$results[0]}; 106 107 my @answers=(shift(@results)); 108 while (@results && $results{$results[0]} < ($opt_u *$a)) { 109 @answers=(@answers,shift(@results)); 97 110 } 98 #print STDERR "$language: $p\n" if $opt_v;99 100 $results{$language} = $p;101 }102 # print STDERR "read language models done (" .103 # timestr(timediff(new Benchmark, $t1)) .104 # ".\n" if $opt_v;105 my @results = sort { $results{$a} <=> $results{$b} } keys %results;106 107 # print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;108 my $a = $results{$results[0]};109 110 my @answers=(shift(@results));111 while (@results && $results{$results[0]} < ($opt_u *$a)) {112 @answers=(@answers,shift(@results));113 }114 111 115 return@answers;112 return \@answers; 116 113 } 117 114 115 sub create_lm { 116 # $ngram contains reference to the hash we build 117 # then add the ngrams found in each word in the hash 118 my ($self, $textref) = @_; 119 120 my $ngram = {}; 118 121 122 foreach my $word (split(/[$non_word_characters]+/, $$textref)) { 123 $word = "_" . $word . "_"; 124 my $len = length($word); 125 my $flen=$len; 126 my $i; 119 127 120 sub create_lm { 121 # my $t1 = new Benchmark; 122 my $ngram; 123 ($_,$ngram) = @_; #$ngram contains reference to the hash we build 124 # then add the ngrams found in each word in the hash 125 my $word; 126 foreach $word (split("[$non_word_characters]+")) { 127 $word = "_" . $word . "_"; 128 my $len = length($word); 129 my $flen=$len; 130 my $i; 131 for ($i=0;$i<$flen;$i++) { 132 $$ngram{substr($word,$i,5)}++ if $len > 4; 133 $$ngram{substr($word,$i,4)}++ if $len > 3; 134 $$ngram{substr($word,$i,3)}++ if $len > 2; 135 $$ngram{substr($word,$i,2)}++ if $len > 1; 136 $$ngram{substr($word,$i,1)}++; 137 $len--; 128 for ($i=0; $i<$flen; $i++) { 129 $ngram->{substr($word,$i,5)}++ if $len > 4; 130 $ngram->{substr($word,$i,4)}++ if $len > 3; 131 $ngram->{substr($word,$i,3)}++ if $len > 2; 132 $ngram->{substr($word,$i,2)}++ if $len > 1; 133 $ngram->{substr($word,$i,1)}++; 134 $len--; 135 } 138 136 } 139 }140 ###print "@{[%$ngram]}";141 # my $t2 = new Benchmark;142 # print STDERR "count_ngrams done (".143 # timestr(timediff($t2, $t1)) .").\n" if $opt_v;144 137 145 # as suggested by Karel P. de Vos, [email protected], we speed up 146 # sorting by removing singletons 147 map { my $key=$_; if ($$ngram{$key} <= $opt_f) 148 { delete $$ngram{$key}; }; } keys %$ngram; 149 138 map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; } 139 } keys %$ngram; 150 140 151 # sort the ngrams, and spit out the $opt_t frequent ones. 152 # adding `or $a cmp $b' in the sort block makes sorting five 153 # times slower..., although it would be somewhat nicer (unique result) 154 my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram; 155 splice(@sorted,$opt_t) if (@sorted > $opt_t); 156 # print STDERR "sorting done (" . 157 # timestr(timediff(new Benchmark, $t2)) . 158 # ").\n" if $opt_v; 159 return @sorted; 141 # sort the ngrams, and spit out the $opt_t frequent ones. 142 # adding `or $a cmp $b' in the sort block makes sorting five 143 # times slower..., although it would be somewhat nicer (unique result) 144 my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram; 145 splice(@sorted,$opt_t) if (@sorted > $opt_t); 146 return \@sorted; 160 147 } 161 148
Note:
See TracChangeset
for help on using the changeset viewer.