###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in Greenstone digital library system.  Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
| 31 |
|
---|
package textcat;

use strict;
use warnings;
#use Benchmark;

# OPTIONS
#
# Directory holding the "<language>.lm" language-model files.
# GSDLHOME may be unset outside a Greenstone installation, so fall back
# to the empty string rather than concatenating undef (which would
# trigger an uninitialized-value warning).
my $model_dir =
    (defined $ENV{'GSDLHOME'} ? $ENV{'GSDLHOME'} : '') . "/perllib/textcat";

my $opt_f = 1;     # ngrams which occur <= this number of times are removed
my $opt_t = 400;   # topmost number of ngrams that should be used
my $opt_u = 1.05;  # how much worse a result must be before it is ignored

# Character class of word separators (digits and whitespace).
my $non_word_characters = '0-9\s';
|
---|
# CLASSIFICATION
#
# What language is a text string?
# Input:  text string
# Output: array of language names, best match first.  More than one
#         name is returned when a runner-up's score is within a factor
#         of $opt_u of the best score.

sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;   # penalty for an ngram absent from a model

    # Scan the model directory to find which languages are supported:
    # every readable "<language>.lm" file contributes one candidate.
    # The substitution is anchored so only the ".lm" extension is
    # stripped, not an accidental ".lm" elsewhere in the name.
    opendir my $dir, $model_dir or die "directory $model_dir: $!\n";
    my @languages =
        sort grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($dir);
    closedir $dir;
    @languages
        or die "sorry, can't read any language models from $model_dir\n"
             . "language models must reside in files with .lm ending\n";

    # Create the ngram profile for the input text.  Note that the count
    # hash built inside create_lm is not needed here; only the ranked
    # ngram list is used.
    my @unknown = create_lm($input);

    # Load each model and score it against the input profile.
    # my $t1 = new Benchmark;
    foreach my $language (@languages) {
        # Load the language model: ngram => rank (1-based, most
        # frequent first).  Lexical handle and 3-arg open avoid the
        # bareword/2-arg open pitfalls.
        my %ngram = ();
        my $rank  = 1;
        open my $lm, '<', "$model_dir/$language.lm"
            or die "cannot open $language.lm: $!\n";
        while (<$lm>) {
            chomp;
            # Only use lines starting with an ngram character; others
            # are ignored.
            if (/^[^$non_word_characters]+/o) {
                $ngram{$&} = $rank++;
            }
        }
        close($lm);
        #print STDERR "loaded language model $language\n" if $opt_v;

        # Compare the language model with the input ngram list: sum of
        # rank distances, with a fixed penalty ($maxp) for each input
        # ngram missing from the model.  Lower is better.
        my ($i, $p) = (0, 0);
        while ($i < @unknown) {
            if ($ngram{$unknown[$i]}) {
                $p += abs($ngram{$unknown[$i]} - $i);
            }
            else {
                $p += $maxp;
            }
            ++$i;
        }
        #print STDERR "$language: $p\n" if $opt_v;

        $results{$language} = $p;
    }

    # Rank languages by ascending score.  ($best replaces the
    # original's "my $a", which shadowed sort's package variable $a.)
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $best = $results{$results[0]};

    # Keep the winner plus every runner-up within $opt_u * $best.
    my @answers = (shift @results);
    while (@results && $results{$results[0]} < ($opt_u * $best)) {
        push @answers, shift @results;
    }

    return @answers;
}
|
---|
| 117 |
|
---|
| 118 |
|
---|
| 119 |
|
---|
# Create an ngram profile for a piece of text.
#
# Input:  the text as the first argument; optionally a hash reference
#         as the second argument, which receives the raw ngram counts
#         (autovivified when the caller passes none, as classify does).
# Output: list of the (at most $opt_t) most frequent ngrams, most
#         frequent first, singletons (count <= $opt_f) removed.

sub create_lm {
    # my $t1 = new Benchmark;
    my $ngram;
    ($_, $ngram) = @_;   # $ngram: reference to the count hash we build

    # Count every 1..5-gram of every word, with "_" marking word
    # boundaries.  $len shrinks as the window approaches the end of the
    # word so that only ngrams that still fit are counted.
    foreach my $word (split("[$non_word_characters]+")) {
        $word = "_" . $word . "_";
        my $len  = length($word);
        my $flen = $len;
        for (my $i = 0; $i < $flen; $i++) {
            $$ngram{substr($word, $i, 5)}++ if $len > 4;
            $$ngram{substr($word, $i, 4)}++ if $len > 3;
            $$ngram{substr($word, $i, 3)}++ if $len > 2;
            $$ngram{substr($word, $i, 2)}++ if $len > 1;
            $$ngram{substr($word, $i, 1)}++;
            $len--;
        }
    }
    ###print "@{[%$ngram]}";

    # As suggested by Karel P. de Vos, [email protected], we speed up
    # sorting by removing singletons.  (A plain loop over a keys
    # snapshot replaces the original void-context map.)
    foreach my $key (keys %$ngram) {
        delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # Sort the ngrams and return the $opt_t most frequent ones.
    # Adding "or $a cmp $b" in the sort block would give a unique
    # ordering but makes sorting about five times slower.
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if @sorted > $opt_t;
    return @sorted;
}
|
---|
1;   # a Perl module must end with a true value