Context Navigation

← Previous Changeset
Next Changeset →

Changeset 2235

Timestamp:

2001-04-01T13:04:26+12:00 (23 years ago)

Author:

sjboddie

Message:

Hacked the textcat package about so that it only reads all the language
models once (instead of reading them in before processing each document).
Fairly significant speed improvements, as you'd expect.

Location:

trunk/gsdl/perllib

Files:

2 edited

plugins/BasPlug.pm (modified) (3 diffs)
textcat.pm (modified) (3 diffs)

Legend:

Unmodified
Added
Removed

trunk/gsdl/perllib/plugins/BasPlug.pm

-              r2219
+              r2235
     $self->{'outhandle'} = STDERR;
     my $year = (localtime)[5]+1900;
+    $self->{'textcat'} = new textcat();
     # general options available to all plugins
     if (!parsargv::parse(\@_,
 …
     # get the language/encoding
     my @results = textcat::classify($text);
+    my $results = $self->{'textcat'}->classify(\$text);
     # if textcat returns 3 or less possibilities we'll use the
     # first one in the list - otherwise use the defaults
     if (scalar @results > 3) {
+    if (scalar @$results > 3) {
     if ($self->{'input_encoding'} ne 'auto') {
 …
     # format language/encoding
     my ($language, $encoding) = $results[0] =~ /^([^-]*)(?:-(.*))?$/;
+    my ($language, $encoding) = $results->[0] =~ /^([^-]*)(?:-(.*))?$/;
     if (!defined $language) {
     if ($self->{'verbosity'}) {

trunk/gsdl/perllib/textcat.pm

-              r1316
+              r2235
 package textcat;
-use strict;
-#use Benchmark;
 # OPTIONS
 my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";
 …
 my $non_word_characters = '0-9\s';
+sub new {
+    my $class = shift (@_);
+    my $self = {};
+    # open directory to find which languages are supported
+    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
+    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
+    closedir DIR;
+    @languages or die "sorry, can't read any language models from $model_dir\n" .
+    "language models must reside in files with .lm ending\n";
+    # load model and count for each language.
+    foreach my $language (@languages) {
+    my %ngram=();
+    my $rang=1;
+    open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
+    while (<LM>) {
+        chomp;
+        # only use lines starting with appropriate character. Others are ignored.
+        if (/^[^$non_word_characters]+/o) {
+        $self->{'ngrams'}->{$language}->{$&} = $rang++;
+        }
+    }
+    close(LM);
+    }
+    $self->{'languages'} = \@languages;
+    return bless $self, $class;
+}
 # CLASSIFICATION
 …
 sub classify {
   my ($input)=@_;
   my %results=();
   my $maxp = $opt_t;
+    my ($self, $inputref)=@_;
+    my %results = ();
+    my $maxp = $opt_t;
+  # open directory to find which languages are supported
+  opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
+  my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
+  closedir DIR;
+  @languages or die "sorry, can't read any language models from $model_dir\n" .
+    "language models must reside in files with .lm ending\n";
+    # create ngrams for input.
+    my $unknown = $self->create_lm($inputref);
+  # create ngrams for input. Note that hash %unknown is not used;
+  # it contains the actual counts which are only used under -n: creating
+  # new language model (and even then they are not really required).
+  my @unknown=create_lm($input);
+  # load model and count for each language.
+  my $language;
+  # my $t1 = new Benchmark;
+  foreach $language (@languages) {
+    # loads the language model into hash %$language.
+    my %ngram=();
+    my $rang=1;
+    open(LM,"$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
+    while (<LM>) {
+      chomp;
+      # only use lines starting with appropriate character. Others are
+      # ignored.
+      if (/^[^$non_word_characters]+/o) {
+    $ngram{$&} = $rang++;
+      }
+    foreach my $language (@{$self->{'languages'}}) {
+    # compare language model with input ngrams list
+    my ($i,$p)=(0,0);
+    while ($i < scalar (@$unknown)) {
+        if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
+        $p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
+        } else {
+        $p=$p+$maxp;
+        }
+        ++$i;
+    }
+    $results{$language} = $p;
+    }
+    close(LM);
+    #print STDERR "loaded language model $language\n" if $opt_v;
+    # compares the language model with input ngrams list
+    my ($i,$p)=(0,0);
+    while ($i < @unknown) {
+      if ($ngram{$unknown[$i]}) {
+    $p=$p+abs($ngram{$unknown[$i]}-$i);
+      } else {
+    $p=$p+$maxp;
+      }
+      ++$i;
+    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
+    my $a = $results{$results[0]};
+    my @answers=(shift(@results));
+    while (@results && $results{$results[0]} < ($opt_u *$a)) {
+    @answers=(@answers,shift(@results));
+    }
-    #print STDERR "$language: $p\n" if $opt_v;
-    $results{$language} = $p;
+  }
-  # print STDERR "read language models done (" .
-  #   timestr(timediff(new Benchmark, $t1)) .
-  #   ".\n" if $opt_v;
-  my @results = sort { $results{$a} <=> $results{$b} } keys %results;
-  # print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
-  my $a = $results{$results[0]};
-  my @answers=(shift(@results));
-  while (@results && $results{$results[0]} < ($opt_u *$a)) {
-    @answers=(@answers,shift(@results));
+  }
   return @answers;
+    return \@answers;
+}
+sub create_lm {
+    # $ngram contains reference to the hash we build
+    # then add the ngrams found in each word in the hash
+    my ($self, $textref) = @_;
+    my $ngram = {};
+    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
+    $word = "_" . $word . "_";
+    my $len = length($word);
+    my $flen=$len;
+    my $i;
+sub create_lm {
+  # my $t1 = new Benchmark;
+  my $ngram;
+  ($_,$ngram) = @_;  #$ngram contains reference to the hash we build
+                     # then add the ngrams found in each word in the hash
+  my $word;
+  foreach $word (split("[$non_word_characters]+")) {
+    $word = "_" . $word . "_";
+    my $len = length($word);
+    my $flen=$len;
+    my $i;
+    for ($i=0;$i<$flen;$i++) {
+      $$ngram{substr($word,$i,5)}++ if $len > 4;
+      $$ngram{substr($word,$i,4)}++ if $len > 3;
+      $$ngram{substr($word,$i,3)}++ if $len > 2;
+      $$ngram{substr($word,$i,2)}++ if $len > 1;
+      $$ngram{substr($word,$i,1)}++;
+      $len--;
+    for ($i=0; $i<$flen; $i++) {
+        $ngram->{substr($word,$i,5)}++ if $len > 4;
+        $ngram->{substr($word,$i,4)}++ if $len > 3;
+        $ngram->{substr($word,$i,3)}++ if $len > 2;
+        $ngram->{substr($word,$i,2)}++ if $len > 1;
+        $ngram->{substr($word,$i,1)}++;
+        $len--;
+    }
+    }
+  }
-  ###print "@{[%$ngram]}";
-  # my $t2 = new Benchmark;
-  # print STDERR "count_ngrams done (".
-  #   timestr(timediff($t2, $t1)) .").\n" if $opt_v;
+  # as suggested by Karel P. de Vos, we speed up
+  # sorting by removing singletons
+  map { my $key=$_; if ($$ngram{$key} <= $opt_f)
+             { delete $$ngram{$key}; }; } keys %$ngram;
+    map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; }
+      } keys %$ngram;
+  # sort the ngrams, and spit out the $opt_t frequent ones.
+  # adding  `or $a cmp $b' in the sort block makes sorting five
+  # times slower..., although it would be somewhat nicer (unique result)
+  my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
+  splice(@sorted,$opt_t) if (@sorted > $opt_t);
+  # print STDERR "sorting done (" .
+  #  timestr(timediff(new Benchmark, $t2)) .
+  #    ").\n" if $opt_v;
+  return @sorted;
+    # sort the ngrams, and spit out the $opt_t frequent ones.
+    # adding  `or $a cmp $b' in the sort block makes sorting five
+    # times slower..., although it would be somewhat nicer (unique result)
+    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
+    splice(@sorted,$opt_t) if (@sorted > $opt_t);
+    return \@sorted;
+}

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 2235

Legend:

trunk/gsdl/perllib/plugins/BasPlug.pm

trunk/gsdl/perllib/textcat.pm

Download in other formats: