###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
# into a package for use in Greenstone digital library system.  Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

use strict;
#use Benchmark;

# OPTIONS
#
# Directory holding the language models (one <language>.lm file each).
# NOTE(review): relies on $GSDLHOME being set; classify() dies otherwise.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Character class of characters that never occur inside a word:
# digits and whitespace.  Used both for splitting input into words
# and for filtering model-file lines.
my $non_word_characters = '0-9\s';

# CLASSIFICATION
#
# What language is a text string?
# Input: text string
# Output: array of language names, best match first.  More than one
#         language is returned when a runner-up's score is within a
#         factor of $opt_u of the best (lowest) score.
sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;    # penalty for an ngram absent from a model

    # Open the model directory to find which languages are supported.
    # A language model must reside in a readable file with .lm ending.
    opendir my $dh, $model_dir or die "directory $model_dir: $!\n";
    # Anchor the substitution so only a trailing ".lm" is stripped
    # (the original unanchored s/\.lm// could mangle mid-name matches).
    my @languages =
        sort( grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($dh) );
    closedir $dh;

    @languages
        or die "sorry, can't read any language models from $model_dir\n"
             . "language models must reside in files with .lm ending\n";

    # Create ngrams for the input.  (create_lm can also fill a hash of
    # raw counts via a second argument, but classification only needs
    # the ranked list it returns.)
    my @unknown = create_lm($input);

    # Score the input against each language model: sum, over the input
    # ngrams, of the rank distance to the same ngram in the model, with
    # a fixed penalty of $maxp when the model lacks the ngram.  Lower
    # scores are better.
    foreach my $language (@languages) {
        # Load the language model into %ngram: ngram => rank, where
        # rank is the 1-based position of the ngram in the model file.
        my %ngram = ();
        my $rang  = 1;
        open my $lm, '<', "$model_dir/$language.lm"
            or die "cannot open $language.lm: $!\n";
        # BUGFIX: the readline operator was missing here ("while ()"),
        # so no model lines were ever read.
        while (<$lm>) {
            chomp;
            # Only use lines starting with an appropriate character
            # (i.e. not a digit or whitespace).  Others are ignored.
            if (/^[^$non_word_characters]+/o) {
                $ngram{$&} = $rang++;
            }
        }
        close($lm);

        # Compare the language model with the input ngram list.
        my ($i, $p) = (0, 0);
        while ($i < @unknown) {
            if ($ngram{ $unknown[$i] }) {
                $p = $p + abs($ngram{ $unknown[$i] } - $i);
            } else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    # Sort languages by ascending score.
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;

    # Keep the best language plus any whose score is within a factor
    # of $opt_u of the best.  ($best replaces the original "my $a",
    # which shadowed sort's special variable.)
    my $best    = $results{ $results[0] };
    my @answers = (shift(@results));
    while (@results && $results{ $results[0] } < ($opt_u * $best)) {
        push @answers, shift(@results);
    }
    return @answers;
}

# Build the ngram profile of a piece of text.
# Input:  $_[0] -- the text.
#         $_[1] -- optional hash reference that receives the raw ngram
#                  counts (autovivified to a fresh anonymous hash when
#                  the caller, like classify(), passes nothing).
# Output: list of ngrams (1- to 5-grams over "_"-padded words), most
#         frequent first, after discarding ngrams occurring <= $opt_f
#         times and truncating to the $opt_t most frequent.
sub create_lm {
    my $ngram;
    ($_, $ngram) = @_;    # $ngram contains reference to the hash we build

    # Count the ngrams of every word.  Words are wrapped in "_" so that
    # word-initial and word-final ngrams are distinct from interior ones.
    foreach my $word (split("[$non_word_characters]+")) {
        $word = "_" . $word . "_";
        my $len  = length($word);
        my $flen = $len;
        for (my $i = 0; $i < $flen; $i++) {
            # Take every 1..5-gram starting at position $i; $len shrinks
            # so shorter ngrams are taken near the end of the word.
            $$ngram{ substr($word, $i, 5) }++ if $len > 4;
            $$ngram{ substr($word, $i, 4) }++ if $len > 3;
            $$ngram{ substr($word, $i, 3) }++ if $len > 2;
            $$ngram{ substr($word, $i, 2) }++ if $len > 1;
            $$ngram{ substr($word, $i, 1) }++;
            $len--;
        }
    }

    # As suggested by Karel P. de Vos, k.vos@elsevier.nl, we speed up
    # sorting by removing singletons (counts <= $opt_f) first.
    # (foreach replaces the original map-in-void-context; keys %$ngram
    # is flattened before iteration, so deleting inside the loop is safe.)
    foreach my $key (keys %$ngram) {
        delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # Sort the ngrams by descending frequency and keep the $opt_t most
    # frequent ones.  Adding `or $a cmp $b' in the sort block would make
    # the result unique, but measured about five times slower.
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);

    return @sorted;
}

1;