Context Navigation

source: main/trunk/greenstone2/perllib/textcat.pm@ 31960

Last change on this file since 31960 was 19050, checked in by davidb, 15 years ago
No need to set filecontents_cache to undef first
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 11.8 KB

Line
1	###########################################################################
2	#
3	# textcat.pm -- Identify the language of a piece of text
4	#
5	#
6	# This file is based on TextCat version 1.08 by Gertjan van Noord
7	# Copyright (C) 1997 Gertjan van Noord ([email protected])
8	# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9	#
10	# It was modified by Gordon Paynter ([email protected]) and turned
11	# into a package for use in Greenstone digital library system. Most of
12	# the modifications consist of commenting out or deleting functionality
13	# I don't need.
14	#
15	#
16	# This program is free software; you can redistribute it and/or modify
17	# it under the terms of the GNU General Public License as published by
18	# the Free Software Foundation; either version 2 of the License, or
19	# (at your option) any later version.
20	#
21	# This program is distributed in the hope that it will be useful,
22	# but WITHOUT ANY WARRANTY; without even the implied warranty of
23	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24	# GNU General Public License for more details.
25	#
26	# You should have received a copy of the GNU General Public License
27	# along with this program; if not, write to the Free Software
28	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29	#
30	###########################################################################
31
32	package textcat;
33
34	use strict;
35
36	# OPTIONS
37	my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";
38
39	my $opt_f = 1; # Ngrams which occur <= this number of times are removed
40	my $opt_t = 400; # topmost number of ngrams that should be used
41	my $opt_u = 1.05; # how much worse result must be before it is ignored
42
43	my $non_word_characters = '0-9\s';
44
45	# caching related
46	my %filename_cache = (); # map of cached text-strings each to array of char-encodings for the strings themselves
47	my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
48	my $MAX_CACHE_SIZE = 1000;
49
50	sub new {
51	my $class = shift (@_);
52	my ($tmp_f, $tmp_t, $tmp_u) = @_;
53
54	my $self = {};
55
56	# open directory to find which languages are supported
57	opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
58	my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
59	closedir DIR;
60	@languages or die "sorry, can't read any language models from $model_dir\n" .
61	"language models must reside in files with .lm ending\n";
62
63	# load model and count for each language.
64	foreach my $language (@languages) {
65	my %ngram=();
66	my $rang=1;
67	open(LM, "$model_dir/$language.lm") \|\| die "cannot open $language.lm: $!\n";
68	while (<LM>) {
69	chomp;
70	# only use lines starting with appropriate character. Others are ignored.
71	if (/^[^$non_word_characters]+/o) {
72	$self->{'ngrams'}->{$language}->{$&} = $rang++;
73	}
74	}
75	close(LM);
76	}
77
78	$self->{'languages'} = \@languages;
79
80	$self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
81	$self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
82	$self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
83	$self->{'max_cache_size'} = $MAX_CACHE_SIZE;
84
85	return bless $self, $class;
86	}
87
88
89	# CLASSIFICATION
90	#
91	# What language is a text string?
92	# Input: text string
93	# Output: array of language names
94	# $languages is the set of language models to consider (to textcat on)
95	# Can be set to filter out language models that don't belong to the given encoding
96	# in order to obtain a list of the probable languages for that known encoding.
97	# $filter_by_encoding indicates what encoding to narrow the search for languages down to.
98	# This is for when we already know the encoding, but we're still looking for the language.
99	sub classify {
100	my ($self, $inputref, $filter_by_encoding)=@_;
101	my $languages;
102	@$languages = ();
103
104	# filter language filenames by encoding
105	if(defined $filter_by_encoding) {
106	# make sure to normalize language and filtering encoding so we are not
107	# stuck comparing hyphens with underscores in such things as iso-8859-1
108	my $normalized_filter = $filter_by_encoding;
109	$normalized_filter =~ s/[\W\_]//g;
110
111	foreach my $lang (@{$self->{'languages'}}) {
112	my $normalized_lang = $lang;
113	$normalized_lang =~ s/[\W\_]//g;
114
115	if($normalized_lang =~ m/$normalized_filter/i) {
116	push (@$languages, $lang);
117	}
118	}
119	}
120
121	# if the filter_by_encoding wasn't in the list of language model filenames
122	# or if we're not filtering, then work with all language model filenames
123	if(scalar @$languages == 0) {
124	$languages = $self->{'languages'};
125	}
126
127	my %results = ();
128	my $maxp = $self->{'opt_t'};
129
130	# create ngrams for input.
131	my $unknown = $self->create_lm($inputref);
132
133	foreach my $language (@$languages) {
134	# compare language model with input ngrams list
135	my ($i,$p)=(0,0);
136	while ($i < scalar (@$unknown)) {
137	if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
138	$p=$p+abs($self->{'ngrams'}->{$language}->{$unknown->[$i]}-$i);
139	} else {
140	$p=$p+$maxp;
141	}
142	++$i;
143	}
144	$results{$language} = $p;
145	}
146
147	my @results = sort { $results{$a} <=> $results{$b} } keys %results;
148	my $a = $results{$results[0]};
149
150	my @answers=(shift(@results));
151	while (@results && $results{$results[0]} < ($self->{'opt_u'} *$a)) {
152	@answers=(@answers,shift(@results));
153	}
154
155	return \@answers;
156	}
157
158
159	# Same as below, but caches textcat results on filenames for subsequent use.
160	# The cache is a map of the filename to the corresponding filename_encodings
161	# (an array of results returned by textcat of the possible filename-encodings
162	# for the indexing filename string itself).
163	# Need to make sure that the filename is only the tailname: no path and no
164	# extension (no digits), in order to make optimum use of cached textcat.
165	# Textcat is performed on $filename_ref and the results associated with $filename_ref.
166	# The cache will be cleared when the max_cache_size is reached, which is
167	# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
168	# can also be cleared by a call to clear_filename_cache.
169	sub classify_cached_filename {
170	my ($self, $filename_ref)=@_;
171
172	# if not already in the cache, work it out and put it there
173	if (!defined $filename_cache{$$filename_ref})
174	{
175	if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) {
176	$self->clear_filename_cache();
177	}
178	$filename_cache{$$filename_ref} = $self->classify($filename_ref);
179	}
180
181	# return cached array of encodings for the given string
182	return $filename_cache{$$filename_ref};
183	}
184
185	# Same as above, but caches textcat results on filecontents for subsequent use.
186	# Textcat on a file's contents to work out its possible encodings. Uses the cache.
187	# The cache is a map of the filename to an array of possible filename_encodings
188	# for the contents of the file returned by textcat.
189	# Textcat is performed on $contents_ref and the results associated with $filename.
190	# The cache will be cleared when the max_cache_size is reached, which is
191	# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
192	# can also be cleared by a call to clear_filecontents_cache.
193	sub classify_contents {
194	my ($self, $contents_ref, $filename)=@_;
195
196	# if not already in the cache, work it out and put it there
197	if (!defined $filecontents_cache{$filename})
198	{
199	if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
200	$self->clear_filecontents_cache();
201	}
202
203	# Finally, we can perform the textcat classification of language and encoding
204	$filecontents_cache{$filename} = $self->classify($contents_ref);
205	}
206	# return cached array of content encodings for the given filename
207	return $filecontents_cache{$filename};
208	}
209
210
211	# Given the known encoding for a file's contents, performs a textcat
212	# filtering on the languages for the given encoding. Results are stored
213	# in the cache TWICE: once under $filename\|$filter_by_encoding, and
214	# once under the usual $filename, so that subsequent calls to either
215	# this method or classify_contents using the same filename will not
216	# perform textcat again.
217	sub classify_contents_for_encoding {
218	my ($self, $contents_ref, $filename, $filter_by_encoding)=@_;
219
220	if (!defined $filecontents_cache{"$filename\|$filter_by_encoding"})
221	{
222	if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
223	$self->clear_filecontents_cache();
224	}
225
226	$filecontents_cache{"$filename\|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding);
227	# store this in cache again under $filename entry, so that subsequent
228	# calls to classify_contents will find it in the cache already
229	$filecontents_cache{$filename} = $self->classify($contents_ref, $filter_by_encoding);
230	}
231	return $filecontents_cache{$filename};
232	}
233
234
235	# This method returns the most frequently occurring encoding
236	# but only if any encoding occurs more than once in the given results.
237	# Otherwise, "" is returned.
238	sub most_frequent_encoding {
239	my ($self, $results) = @_;
240	my $best_encoding = "";
241
242	# guessed_encodings is a hashmap of Encoding -> Frequency pairs
243	my %guessed_encodings = ();
244	foreach my $result (@$results) {
245	# Get the encoding portion of a language-model filename like en-iso8859_1
246	my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
247	if(!defined($guessed_encodings{$encoding})) {
248	$guessed_encodings{$encoding} = 0;
249	}
250	$guessed_encodings{$encoding}++;
251	}
252
253	$guessed_encodings{""}=-1; # for default best_encoding of ""
254
255	foreach my $enc (keys %guessed_encodings) {
256	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
257	$best_encoding = $enc;
258	}
259	}
260
261	# If best_encoding's frequency == 1, then the frequency for all encodings will
262	# be 1 since the sum total of all frequencies is num_results: if any encoding
263	# has frequency > 1 (it's possibly the best_encoding), one or more of the others
264	# would have been at 0 frequency to compensate.
265	return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
266	}
267
268
269	# set some of the specific member variables
270	sub set_opts {
271	my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache)=@_;
272
273	$self->{'opt_f'} = $opt_freq if defined $opt_freq;
274	$self->{'opt_u'} = $opt_factor if defined $opt_factor;
275	$self->{'opt_t'} = $opt_top if defined $opt_top;
276
277	$self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
278	}
279
280	sub get_opts {
281	my $self = shift (@_);
282	return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'});
283	}
284
285	# Clears the filename cache (a map of strings to the textcat results for each string).
286	sub clear_filename_cache {
287	my $self = shift (@_);
288
289	%filename_cache = ();
290	}
291
292	# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
293	sub clear_filecontents_cache {
294	my $self = shift (@_);
295
296	%filecontents_cache = ();
297	}
298
299	sub create_lm {
300	# $ngram contains reference to the hash we build
301	# then add the ngrams found in each word in the hash
302	my ($self, $textref) = @_;
303
304	my $ngram = {};
305
306	foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
307	$word = "_" . $word . "_";
308	my $len = length($word);
309	my $flen=$len;
310	my $i;
311
312	for ($i=0; $i<$flen; $i++) {
313	$ngram->{substr($word,$i,5)}++ if $len > 4;
314	$ngram->{substr($word,$i,4)}++ if $len > 3;
315	$ngram->{substr($word,$i,3)}++ if $len > 2;
316	$ngram->{substr($word,$i,2)}++ if $len > 1;
317	$ngram->{substr($word,$i,1)}++;
318	$len--;
319	}
320	}
321
322	map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; }
323	} keys %$ngram;
324
325	# sort the ngrams, and spit out the $opt_t frequent ones.
326	# adding `or $a cmp $b' in the sort block makes sorting five
327	# times slower..., although it would be somewhat nicer (unique result)
328	my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
329	splice(@sorted,$self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
330	return \@sorted;
331	}
332
333	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: