source: gsdl/trunk/perllib/textcat.pm@ 15894

Last change on this file since 15894 was 15894, checked in by mdewsnip, 16 years ago

Added "use strict" to the files missing it.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.5 KB
Line 
1###########################################################################
2#
3# textcat.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord ([email protected])
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter ([email protected]) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
package textcat;

use strict;
use warnings;   # added: catches uninitialized values, e.g. an unset GSDLHOME

# OPTIONS
# Directory holding the "*.lm" language-model files shipped with Greenstone.
# NOTE(review): assumes GSDLHOME is set in the environment; if it is not,
# $model_dir is built from undef and new() will die when opening it.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Character class (as a string, interpolated into regexes below) of
# characters that never occur inside a "word": digits and whitespace.
my $non_word_characters = '0-9\s';
44
# Constructor: scan $model_dir for "*.lm" language-model files and load
# each model into $self->{'ngrams'}{$language}{$ngram} = rank.
# Dies if the model directory cannot be read or holds no usable models.
sub new {
    my $class = shift (@_);
    my $self = {};

    # open directory to find which languages are supported
    # (lexical dirhandle instead of the original package-global bareword DIR)
    opendir(my $model_dh, $model_dir) or die "directory $model_dir: $!\n";
    # Keep only readable "*.lm" files; the substitution strips the extension
    # so @languages holds bare language names.  The $ anchor (added) makes
    # sure only a trailing ".lm" is removed.
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir($model_dh);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
	# Rank of the next ngram read from the model file (1 = most frequent).
	my $rang = 1;
	# Three-argument open with a lexical filehandle; the original
	# two-argument bareword open is unsafe if a filename ever begins
	# with a mode character.  (Also dropped an unused "my %ngram" local.)
	open(my $lm_fh, '<', "$model_dir/$language.lm")
	    || die "cannot open $language.lm: $!\n";
	while (<$lm_fh>) {
	    chomp;
	    # only use lines starting with appropriate character. Others are ignored.
	    # A capture group replaces $&, avoiding the global match-variable
	    # performance penalty on older perls.
	    if (/^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rang++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;
    return bless $self, $class;
}
74
75
76
77# CLASSIFICATION
78#
79# What language is a text string?
80# Input: text string
81# Output: array of language names
82
# Compare the input text's ngram profile against every loaded language
# model using the out-of-place rank-distance measure, and return the
# closest language(s).
#
# Input:  $inputref - reference to a text string
# Output: reference to an array of language names, best match first;
#         contains every language whose score is within a factor of
#         $opt_u of the best score.
sub classify {
    my ($self, $inputref) = @_;
    my %results = ();

    # Penalty charged when an input ngram is absent from a language model.
    my $maxp = $opt_t;

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

	# compare language model with input ngrams list: sum over input
	# ngrams of |model rank - input rank|, with $maxp for misses.
	my ($i, $p) = (0, 0);
	while ($i < scalar (@$unknown)) {
	    if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
		$p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}
	$results{$language} = $p;
    }

    # Rank languages by ascending distance score.
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;

    # Renamed from the original "my $a", which shadowed the package
    # variable $a that sort blocks rely on -- a classic Perl trap.
    my $best_score = $results{$results[0]};

    # Accept the winner plus any language scoring within $opt_u of it.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($opt_u * $best_score)) {
	@answers = (@answers, shift(@results));
    }

    return \@answers;
}
116
# Build an ngram frequency profile for the text in $$textref and return a
# reference to the list of (at most $opt_t) most frequent ngrams, most
# frequent first.  Ngrams occurring <= $opt_f times are discarded.
sub create_lm {
    # $ngram contains reference to the hash we build
    # then add the ngrams found in each word in the hash
    my ($self, $textref) = @_;

    my $ngram = {};

    # Count every 1..5-character ngram of each word; words are wrapped in
    # '_' markers so that word boundaries contribute ngrams too.
    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;
	my $i;

	for ($i = 0; $i < $flen; $i++) {
	    # $len counts the characters remaining from position $i, so
	    # these guards stop short ngrams from running off the end.
	    $ngram->{substr($word, $i, 5)}++ if $len > 4;
	    $ngram->{substr($word, $i, 4)}++ if $len > 3;
	    $ngram->{substr($word, $i, 3)}++ if $len > 2;
	    $ngram->{substr($word, $i, 2)}++ if $len > 1;
	    $ngram->{substr($word, $i, 1)}++;
	    $len--;
	}
    }

    # Drop rare ngrams.  A hash-slice delete replaces the original
    # map-in-void-context, which built and discarded a result list.
    delete @{$ngram}{ grep { $ngram->{$_} <= $opt_f } keys %$ngram };

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);
    return \@sorted;
}
150
1;  # a Perl module must end by returning a true value
Note: See TracBrowser for help on using the repository browser.