source: gsdl/trunk/perllib/textcat.pm @ 14119

Last change on this file since 14119 was 2235, checked in by sjboddie, 23 years ago

Hacked the textcat package about so that it only reads all the language
models once (instead of reading them in before processing each document).
Fairly significant speed improvements, as you'd expect.
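
Since the models are now loaded once in the constructor, a single textcat
object can be reused to classify any number of documents. A minimal sketch
(variable names are illustrative, not from the source):

  my $tc = textcat->new();
  foreach my $docref (@document_text_refs) {
      my $langs = $tc->classify($docref);  # ref to array of language names
  }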

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.5 KB
###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in the Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

# OPTIONS
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1; # Ngrams which occur <= this number of times are removed
my $opt_t = 400; # maximum number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

my $non_word_characters = '0-9\s';
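# digits and whitespace delimit words; model lines beginning with one of
# these characters are skipped when the language models are loaded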

sub new {
    my $class = shift (@_);
    my $self = {};

    # open directory to find which languages are supported
    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
        my %ngram=();
        my $rang=1;
        open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
        while (<LM>) {
            chomp;
            # only use lines starting with appropriate character. Others are ignored.
            if (/^[^$non_word_characters]+/o) {
                $self->{'ngrams'}->{$language}->{$&} = $rang++;
            }
        }
        close(LM);
    }

    $self->{'languages'} = \@languages;
    return bless $self, $class;
}



# CLASSIFICATION
#
# What language is a text string?
# Input:  a reference to the text string
# Output: a reference to an array of language names
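#
# Example usage (an illustrative sketch; assumes GSDLHOME is set so the
# language models in $GSDLHOME/perllib/textcat can be found):
#
#   my $tc = textcat->new();
#   my $languages = $tc->classify(\$text);   # ref to array of language names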

sub classify {
    my ($self, $inputref)=@_;
    my %results = ();
    my $maxp = $opt_t;

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

        # compare language model with input ngrams list
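        # (this is the "out-of-place" rank distance: for each ngram in the
        # input profile, add the difference between its rank there and its
        # rank in the language model; ngrams absent from the model incur
        # the maximum penalty $maxp.  The language with the lowest total wins.)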
        my ($i,$p)=(0,0);
        while ($i < scalar (@$unknown)) {
            if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
                $p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
            } else {
                $p=$p+$maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $a = $results{$results[0]};

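    # return the best-scoring language, together with any others whose
    # score is within a factor of $opt_u of the best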
    my @answers=(shift(@results));
    while (@results && $results{$results[0]} < ($opt_u * $a)) {
        @answers=(@answers,shift(@results));
    }

    return \@answers;
}

sub create_lm {
    # $ngram is a reference to the hash we build up: the ngrams found in
    # each word are added to it as keys, with their counts as values
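    # (for example, the word "text" is padded to "_text_" and contributes
    # the ngrams "_", "_t", "_te", "_tex", "_text", "t", "te", ... with
    # their occurrence counts as values)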
    my ($self, $textref) = @_;

    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
        $word = "_" . $word . "_";
        my $len = length($word);
        my $flen=$len;
        my $i;

        for ($i=0; $i<$flen; $i++) {
            $ngram->{substr($word,$i,5)}++ if $len > 4;
            $ngram->{substr($word,$i,4)}++ if $len > 3;
            $ngram->{substr($word,$i,3)}++ if $len > 2;
            $ngram->{substr($word,$i,2)}++ if $len > 1;
            $ngram->{substr($word,$i,1)}++;
            $len--;
        }
    }

    map { if ($ngram->{$_} <= $opt_f) { delete $ngram->{$_}; }
    } keys %$ngram;

    # sort the ngrams and return the $opt_t most frequent ones.
    # adding `or $a cmp $b' to the sort block makes sorting five
    # times slower, although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted,$opt_t) if (@sorted > $opt_t);
    return \@sorted;
}

1;