source: trunk/gsdl/perllib/textcat.pm @ 1483

Last change on this file since 1483 was 1316, checked in by paynter, 24 years ago

The textcat language identification package.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.2 KB
Line 
1###########################################################################
2#
3# textcat.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord ([email protected])
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter ([email protected]) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
package textcat;

use strict;
use warnings;    # was missing; catches undef/numeric slips at runtime
#use Benchmark;

# OPTIONS
#
# Directory holding the "<language>.lm" language-model files.  Relies on
# GSDLHOME being set in the environment (standard for Greenstone).
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

# Characters that never occur inside a "word" for ngram purposes
# (digits and whitespace); used as a character class in regexps below.
my $non_word_characters = '0-9\s';
46
47# CLASSIFICATION
48#
49# What language is a text string?
50# Input: text string
51# Output: array of language names
52
# CLASSIFICATION
#
# What language is a text string?
# Input:  text string
# Output: array of language names, best match first.  More than one name
#         is returned when runner-up scores are within a factor of $opt_u
#         of the best (lowest) score, i.e. the result is ambiguous.

sub classify {
    my ($input) = @_;
    my %results = ();
    my $maxp = $opt_t;    # penalty for an input ngram absent from a model

    # Scan the model directory to find which languages are supported:
    # every readable "<language>.lm" file is one candidate language.
    # (3-arg open/lexical handles throughout; the old bareword DIR/LM
    # handles and 2-arg open were deprecated forms.)
    opendir my $dir, $model_dir or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir($dir));
    closedir $dir;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Create the ranked ngram list for the input text.
    my @unknown = create_lm($input);

    # Load each language model and score the input against it.
    foreach my $language (@languages) {
	# Model file format: one ngram per line, most frequent first;
	# we record ngram => rank (1-based).
	my %ngram = ();
	my $rang = 1;
	open my $lm_fh, '<', "$model_dir/$language.lm"
	    or die "cannot open $language.lm: $!\n";
	while (<$lm_fh>) {
	    chomp;
	    # Only use lines starting with an appropriate (word)
	    # character.  Others are ignored.
	    if (/^[^$non_word_characters]+/o) {
		$ngram{$&} = $rang++;
	    }
	}
	close($lm_fh);

	# Compare the language model with the input ngram list: each
	# input ngram contributes its rank distance, or $maxp when it
	# does not occur in the model at all.  Lower total = better fit.
	my ($i, $p) = (0, 0);
	while ($i < @unknown) {
	    if ($ngram{$unknown[$i]}) {
		$p = $p + abs($ngram{$unknown[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}

	$results{$language} = $p;
    }

    # Sort candidate languages by score, best (lowest) first.
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;

    # NOTE: renamed from "my $a" — $a/$b are special package variables
    # used by sort(), and declaring a lexical $a silently breaks any
    # subsequent sort block in the same scope.
    my $best_score = $results{$results[0]};

    # Keep the winner plus any language scoring within $opt_u of it.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($opt_u * $best_score)) {
	push @answers, shift(@results);
    }

    return @answers;
}
117
118
119
# Create a ranked ngram list for a text string.
#
# Input:  the text string; optionally a hash reference into which the raw
#         ngram counts are accumulated.  Callers that only want the ranked
#         list (like classify() above) pass nothing — the hash is then
#         created internally.
# Output: array of at most $opt_t ngrams, most frequent first, with
#         singletons (count <= $opt_f) removed.
sub create_lm {
    # my $t1 = new Benchmark;
    my $ngram;
    ($_, $ngram) = @_;    # $ngram: reference to the count hash we build
    # The original relied on autovivification of the undef $ngram via
    # $$ngram{...}++; make the fallback hash explicit.
    $ngram ||= {};

    # Count the ngrams (lengths 1..5) of each word, with the word padded
    # by '_' on both sides so edge characters form distinct ngrams.
    foreach my $word (split("[$non_word_characters]+")) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;
	for (my $i = 0; $i < $flen; $i++) {
	    # $len shrinks as $i approaches the end of the word, so only
	    # ngram sizes that still fit within the word are counted.
	    $$ngram{substr($word, $i, 5)}++ if $len > 4;
	    $$ngram{substr($word, $i, 4)}++ if $len > 3;
	    $$ngram{substr($word, $i, 3)}++ if $len > 2;
	    $$ngram{substr($word, $i, 2)}++ if $len > 1;
	    $$ngram{substr($word, $i, 1)}++;
	    $len--;
	}
    }

    # As suggested by Karel P. de Vos, [email protected], speed up
    # sorting by removing singletons first.  (Was a void-context map used
    # purely for side effects; a plain loop states the intent.  keys()
    # returns a copy, so deleting while iterating is safe.)
    foreach my $key (keys %$ngram) {
	delete $$ngram{$key} if $$ngram{$key} <= $opt_f;
    }

    # Sort the ngrams by descending frequency and keep the $opt_t most
    # frequent.  Adding "or $a cmp $b" to the sort block would make the
    # order unique, but measured about five times slower.
    my @sorted = sort { $$ngram{$b} <=> $$ngram{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);
    return @sorted;
}
161
1621;
Note: See TracBrowser for help on using the repository browser.