source: trunk/gsdl/perllib/textcat.pm @ 1483

Last change on this file since 1483 was 1316, checked in by paynter, 24 years ago

The textcat language identification package.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1###########################################################################
2#
3# textcat.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord ([email protected])
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter ([email protected]) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
package textcat;

use strict;
use warnings;    # was missing; catches undef/numeric slips at runtime
#use Benchmark;

# OPTIONS
#
# Directory holding the "<language>.lm" language-model files.  Relies on
# GSDLHOME being set in the environment (standard for Greenstone).
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

# Characters that never occur inside a "word" for ngram purposes
# (digits and whitespace); used as a character class in regexps below.
my $non_word_characters = '0-9\s';
46
47# CLASSIFICATION
48#
49# What language is a text string?
50# Input: text string
51# Output: array of language names
52
# CLASSIFICATION
#
# What language is a text string?
# Input:  text string
# Output: array of language names, best match first.  More than one name
#         is returned when runner-up scores are within a factor of $opt_u
#         of the best (lowest) score, i.e. the result is ambiguous.

sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;    # penalty for an input ngram absent from a model

    # Scan the model directory to find which languages are supported:
    # every readable "<language>.lm" file is one candidate language.
    # (3-arg open/lexical handles throughout; the old bareword DIR/LM
    # handles and 2-arg open were deprecated forms.)
    opendir my $dir, $model_dir or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir($dir));
    closedir $dir;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Create the ranked ngram list for the input text.
    my @unknown = create_lm($input);

    # Load each language model and score the input against it.
    foreach my $language (@languages) {
	# Model file format: one ngram per line, most frequent first;
	# we record ngram => rank (1-based).
	my %ngram = ();
	my $rang = 1;
	open my $lm_fh, '<', "$model_dir/$language.lm"
	    or die "cannot open $language.lm: $!\n";
	while (<$lm_fh>) {
	    chomp;
	    # Only use lines starting with an appropriate (word)
	    # character.  Others are ignored.
	    if (/^[^$non_word_characters]+/o) {
		$ngram{$&} = $rang++;
	    }
	}
	close($lm_fh);

	# Compare the language model with the input ngram list: each
	# input ngram contributes its rank distance, or $maxp when it
	# does not occur in the model at all.  Lower total = better fit.
	my ($i, $p) = (0, 0);
	while ($i < @unknown) {
	    if ($ngram{$unknown[$i]}) {
		$p = $p + abs($ngram{$unknown[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}

	$results{$language} = $p;
    }

    # Sort candidate languages by score, best (lowest) first.
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;

    # NOTE: renamed from "my $a" — $a/$b are special package variables
    # used by sort(), and declaring a lexical $a silently breaks any
    # subsequent sort block in the same scope.
    my $best_score = $results{$results[0]};

    # Keep the winner plus any language scoring within $opt_u of it.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($opt_u * $best_score)) {
	push @answers, shift(@results);
    }

    return @answers;
}
117
118
119
# Create a ranked ngram list for a text string.
#
# Input:  the text string; optionally a hash reference into which the raw
#         ngram counts are accumulated.  Callers that only want the ranked
#         list (like classify() above) pass nothing — the hash is then
#         created internally.
# Output: array of at most $opt_t ngrams, most frequent first, with
#         singletons (count <= $opt_f) removed.
sub create_lm {
    # my $t1 = new Benchmark;
    my $ngram;
    ($_, $ngram) = @_;    # $ngram: reference to the count hash we build
    # The original relied on autovivification of the undef $ngram via
    # $$ngram{...}++; make the fallback hash explicit.
    $ngram ||= {};

    # Count the ngrams (lengths 1..5) of each word, with the word padded
    # by '_' on both sides so edge characters form distinct ngrams.
    foreach my $word (split("[$non_word_characters]+")) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;
	for (my $i = 0; $i < $flen; $i++) {
	    # $len shrinks as $i approaches the end of the word, so only
	    # ngram sizes that still fit within the word are counted.
	    $$ngram{substr($word, $i, 5)}++ if $len > 4;
	    $$ngram{substr($word, $i, 4)}++ if $len > 3;
	    $$ngram{substr($word, $i, 3)}++ if $len > 2;
	    $$ngram{substr($word, $i, 2)}++ if $len > 1;
	    $$ngram{substr($word, $i, 1)}++;
	    $len--;
	}
    }

    # As suggested by Karel P. de Vos, [email protected], speed up
    # sorting by removing singletons first.  (Was a void-context map used
    # purely for side effects; a plain loop states the intent.  keys()
    # returns a copy, so deleting while iterating is safe.)
    foreach my $key (keys %$ngram) {
	delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # Sort the ngrams by descending frequency and keep the $opt_t most
    # frequent.  Adding "or $a cmp $b" to the sort block would make the
    # order unique, but measured about five times slower.
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);
    return @sorted;
}
161
1621;
Note: See TracBrowser for help on using the repository browser.