Ignore:
Timestamp:
2001-04-01T13:04:26+12:00 (23 years ago)
Author:
sjboddie
Message:

Hacked the textcat package about so that it only reads all the language
models once (instead of reading them in before processing each document).
Fairly significant speed improvements, as you'd expect.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/textcat.pm

    r1316 r2235  
    3232package textcat;
    3333
    34 use strict;
    35 #use Benchmark;
    36 
    3734# OPTIONS
    3835my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";
     
    4441my $non_word_characters = '0-9\s';
    4542
     43sub new {
     44    my $class = shift (@_);
     45    my $self = {};
     46
     47    # open directory to find which languages are supported
     48    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
     49    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
     50    closedir DIR;
     51    @languages or die "sorry, can't read any language models from $model_dir\n" .
     52    "language models must reside in files with .lm ending\n";
     53
     54    # load model and count for each language.
     55    foreach my $language (@languages) {
     56    my %ngram=();
     57    my $rang=1;
     58    open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
     59    while (<LM>) {
     60        chomp;
     61        # only use lines starting with appropriate character. Others are ignored.
     62        if (/^[^$non_word_characters]+/o) {
     63        $self->{'ngrams'}->{$language}->{$&} = $rang++;
     64        }
     65    }
     66    close(LM);
     67    }
     68
     69    $self->{'languages'} = \@languages;
     70    return bless $self, $class;
     71}
     72
     73
    4674
    4775# CLASSIFICATION
     
    5280
    5381sub classify {
    54   my ($input)=@_;
    55   my %results=();
    56   my $maxp = $opt_t;
     82    my ($self, $inputref)=@_;
     83    my %results = ();
     84    my $maxp = $opt_t;
    5785
    58   # open directory to find which languages are supported
    59   opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    60   my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    61   closedir DIR;
    62   @languages or die "sorry, can't read any language models from $model_dir\n" .
    63     "language models must reside in files with .lm ending\n";
     86    # create ngrams for input.
     87    my $unknown = $self->create_lm($inputref);
    6488
    65   # create ngrams for input. Note that hash %unknown is not used;
    66   # it contains the actual counts which are only used under -n: creating
    67   # new language model (and even then they are not really required).
    68   my @unknown=create_lm($input);
    69   # load model and count for each language.
    70   my $language;
    71   # my $t1 = new Benchmark;
    72   foreach $language (@languages) {
    73     # loads the language model into hash %$language.
    74     my %ngram=();
    75     my $rang=1;
    76     open(LM,"$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
    77     while (<LM>) {
    78       chomp;
    79       # only use lines starting with appropriate character. Others are
    80       # ignored.
    81       if (/^[^$non_word_characters]+/o) {
    82     $ngram{$&} = $rang++;
    83       }
     89    foreach my $language (@{$self->{'languages'}}) {
     90   
     91    # compare language model with input ngrams list
     92    my ($i,$p)=(0,0);
     93    while ($i < scalar (@$unknown)) {
     94        if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
     95        $p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
     96        } else {
     97        $p=$p+$maxp;
     98        }
     99        ++$i;
     100    }
     101    $results{$language} = $p;
    84102    }
    85     close(LM);
    86     #print STDERR "loaded language model $language\n" if $opt_v;
    87    
    88     # compares the language model with input ngrams list
    89     my ($i,$p)=(0,0);
    90     while ($i < @unknown) {
    91       if ($ngram{$unknown[$i]}) {
    92     $p=$p+abs($ngram{$unknown[$i]}-$i);
    93       } else {
    94     $p=$p+$maxp;
    95       }
    96       ++$i;
     103
     104    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
     105    my $a = $results{$results[0]};
     106 
     107    my @answers=(shift(@results));
     108    while (@results && $results{$results[0]} < ($opt_u *$a)) {
     109    @answers=(@answers,shift(@results));
    97110    }
    98     #print STDERR "$language: $p\n" if $opt_v;
    99    
    100     $results{$language} = $p;
    101   }
    102   # print STDERR "read language models done (" .
    103   #   timestr(timediff(new Benchmark, $t1)) .
    104   #   ".\n" if $opt_v;
    105   my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    106  
    107   # print join("\n",map { "$_\t $results{$_}"; } @results),"\n" if $opt_v;
    108   my $a = $results{$results[0]};
    109  
    110   my @answers=(shift(@results));
    111   while (@results && $results{$results[0]} < ($opt_u *$a)) {
    112     @answers=(@answers,shift(@results));
    113   }
    114111
    115   return @answers;
     112    return \@answers;
    116113}
    117114
     115sub create_lm {
     116    # $ngram contains reference to the hash we build
     117    # then add the ngrams found in each word in the hash
     118    my ($self, $textref) = @_; 
     119   
     120    my $ngram = {};
    118121
     122    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
     123    $word = "_" . $word . "_";
     124    my $len = length($word);
     125    my $flen=$len;
     126    my $i;
    119127
    120 sub create_lm {
    121   # my $t1 = new Benchmark;
    122   my $ngram;
    123   ($_,$ngram) = @_;  #$ngram contains reference to the hash we build
    124                      # then add the ngrams found in each word in the hash
    125   my $word;
    126   foreach $word (split("[$non_word_characters]+")) {
    127     $word = "_" . $word . "_";
    128     my $len = length($word);
    129     my $flen=$len;
    130     my $i;
    131     for ($i=0;$i<$flen;$i++) {
    132       $$ngram{substr($word,$i,5)}++ if $len > 4;
    133       $$ngram{substr($word,$i,4)}++ if $len > 3;
    134       $$ngram{substr($word,$i,3)}++ if $len > 2;
    135       $$ngram{substr($word,$i,2)}++ if $len > 1;
    136       $$ngram{substr($word,$i,1)}++;
    137       $len--;
     128    for ($i=0; $i<$flen; $i++) {
     129        $ngram->{substr($word,$i,5)}++ if $len > 4;
     130        $ngram->{substr($word,$i,4)}++ if $len > 3;
     131        $ngram->{substr($word,$i,3)}++ if $len > 2;
     132        $ngram->{substr($word,$i,2)}++ if $len > 1;
     133        $ngram->{substr($word,$i,1)}++;
     134        $len--;
     135    }
    138136    }
    139   }
    140   ###print "@{[%$ngram]}";
    141   # my $t2 = new Benchmark;
    142   # print STDERR "count_ngrams done (".
    143   #   timestr(timediff($t2, $t1)) .").\n" if $opt_v;
    144137
    145   # as suggested by Karel P. de Vos, [email protected], we speed up
    146   # sorting by removing singletons
    147   map { my $key=$_; if ($$ngram{$key} <= $opt_f)
    148              { delete $$ngram{$key}; }; } keys %$ngram;
    149 
     138    map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; }
     139      } keys %$ngram;
    150140 
    151   # sort the ngrams, and spit out the $opt_t frequent ones.
    152   # adding  `or $a cmp $b' in the sort block makes sorting five
    153   # times slower..., although it would be somewhat nicer (unique result)
    154   my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    155   splice(@sorted,$opt_t) if (@sorted > $opt_t);
    156   # print STDERR "sorting done (" .
    157   #  timestr(timediff(new Benchmark, $t2)) .
    158   #    ").\n" if $opt_v;
    159   return @sorted;
     141    # sort the ngrams, and spit out the $opt_t frequent ones.
     142    # adding  `or $a cmp $b' in the sort block makes sorting five
     143    # times slower..., although it would be somewhat nicer (unique result)
     144    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
     145    splice(@sorted,$opt_t) if (@sorted > $opt_t);
     146    return \@sorted;
    160147}
    161148
Note: See TracChangeset for help on using the changeset viewer.