source: gsdl/trunk/perllib/textcat.pm@ 15894

Last change on this file since 15894 was 15894, checked in by mdewsnip, 16 years ago

Added "use strict" to the files missing it.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.5 KB
Line 
1###########################################################################
2#
3# textcat.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord ([email protected])
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter ([email protected]) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
package textcat;

use strict;
use warnings;   # added: catches uninitialized values, e.g. an unset GSDLHOME

# OPTIONS
# Directory holding the "*.lm" language-model files shipped with Greenstone.
# NOTE(review): assumes GSDLHOME is set in the environment; if it is not,
# $model_dir is built from undef and new() will die when opening it.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Character class (as a string, interpolated into regexes below) of
# characters that never occur inside a "word": digits and whitespace.
my $non_word_characters = '0-9\s';
44
# Constructor: scan $model_dir for "*.lm" language-model files and load
# each model into $self->{'ngrams'}{$language}{$ngram} = rank.
# Dies if the model directory cannot be read or holds no usable models.
sub new {
    my $class = shift (@_);
    my $self = {};

    # open directory to find which languages are supported
    # (lexical dirhandle instead of the original package-global bareword DIR)
    opendir(my $model_dh, $model_dir) or die "directory $model_dir: $!\n";
    # Keep only readable "*.lm" files; the substitution strips the extension
    # so @languages holds bare language names.  The $ anchor (added) makes
    # sure only a trailing ".lm" is removed.
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir($model_dh);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
	# Rank of the next ngram read from the model file (1 = most frequent).
	my $rang = 1;
	# Three-argument open with a lexical filehandle; the original
	# two-argument bareword open is unsafe if a filename ever begins
	# with a mode character.  (Also dropped an unused "my %ngram" local.)
	open(my $lm_fh, '<', "$model_dir/$language.lm")
	    || die "cannot open $language.lm: $!\n";
	while (<$lm_fh>) {
	    chomp;
	    # only use lines starting with appropriate character. Others are ignored.
	    # A capture group replaces $&, avoiding the global match-variable
	    # performance penalty on older perls.
	    if (/^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rang++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;
    return bless $self, $class;
}
74
75
76
77# CLASSIFICATION
78#
79# What language is a text string?
80# Input: text string
81# Output: array of language names
82
# Compare the input text's ngram profile against every loaded language
# model using the out-of-place rank-distance measure, and return the
# closest language(s).
#
# Input:  $inputref - reference to a text string
# Output: reference to an array of language names, best match first;
#         contains every language whose score is within a factor of
#         $opt_u of the best score.
sub classify {
    my ($self, $inputref) = @_;
    my %results = ();

    # Penalty charged when an input ngram is absent from a language model.
    my $maxp = $opt_t;

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

	# compare language model with input ngrams list: sum over input
	# ngrams of |model rank - input rank|, with $maxp for misses.
	my ($i, $p) = (0, 0);
	while ($i < scalar (@$unknown)) {
	    if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
		$p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}
	$results{$language} = $p;
    }

    # Rank languages by ascending distance score.
    my @results = sort { $results{$a} <=> $results{$b} } keys %results;

    # Renamed from the original "my $a", which shadowed the package
    # variable $a that sort blocks rely on -- a classic Perl trap.
    my $best_score = $results{$results[0]};

    # Accept the winner plus any language scoring within $opt_u of it.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($opt_u * $best_score)) {
	@answers = (@answers, shift(@results));
    }

    return \@answers;
}
116
# Build an ngram frequency profile for the text in $$textref and return a
# reference to the list of (at most $opt_t) most frequent ngrams, most
# frequent first.  Ngrams occurring <= $opt_f times are discarded.
sub create_lm {
    # $ngram contains reference to the hash we build
    # then add the ngrams found in each word in the hash
    my ($self, $textref) = @_;

    my $ngram = {};

    # Count every 1..5-character ngram of each word; words are wrapped in
    # '_' markers so that word boundaries contribute ngrams too.
    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;
	my $i;

	for ($i = 0; $i < $flen; $i++) {
	    # $len counts the characters remaining from position $i, so
	    # these guards stop short ngrams from running off the end.
	    $ngram->{substr($word, $i, 5)}++ if $len > 4;
	    $ngram->{substr($word, $i, 4)}++ if $len > 3;
	    $ngram->{substr($word, $i, 3)}++ if $len > 2;
	    $ngram->{substr($word, $i, 2)}++ if $len > 1;
	    $ngram->{substr($word, $i, 1)}++;
	    $len--;
	}
    }

    # Drop rare ngrams.  A hash-slice delete replaces the original
    # map-in-void-context, which built and discarded a result list.
    delete @{$ngram}{ grep { $ngram->{$_} <= $opt_f } keys %$ngram };

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $opt_t) if (@sorted > $opt_t);
    return \@sorted;
}
150
1;  # a Perl module must end by returning a true value
Note: See TracBrowser for help on using the repository browser.