source: gsdl/trunk/perllib/textcat.pm@18463

Last change on this file since 18463 was 17214, checked in by ak19, 16 years ago

Significant changes: 1. Textcat can be restricted to a given encoding when the encoding of a file's contents is known. In this case we are after the language of the contents. 2. Code for working out most frequently occurring encoding has been moved into here from ReadTextFile.pm. 3. Subroutines renamed and unnecessary parameters passed to the various versions of classify() have been moved into set and get methods instead.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
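
The change note above describes the current API: classify() can be restricted to a known encoding, most_frequent_encoding() picks a majority encoding out of a set of textcat results, and the tuning options live behind set_opts()/get_opts(). Below is a minimal, hypothetical usage sketch; it assumes GSDLHOME is set, that $GSDLHOME/perllib is on the Perl include path so the module and its language models under perllib/textcat can be found, and the filename and sample text are made-up values.

    use textcat;

    # construct the classifier (reads the *.lm language models from $GSDLHOME/perllib/textcat)
    my $tc = textcat->new();
    $tc->set_opts(undef, undef, undef, 500);   # e.g. lower the cache limit from the default 1000

    my $contents = "This is a short sample of text whose language we want to guess.";
    my $filename = "doc.txt";                  # illustrative filename used as the cache key

    # classify the file contents; returns a ref to an array of language-model names
    my $results = $tc->classify_contents(\$contents, $filename);

    # pick the encoding that occurs most often among the candidate models, if any does
    my $encoding = $tc->most_frequent_encoding($results);

    if ($encoding ne "") {
        # now that the encoding is known, restrict textcat to it to narrow down the language
        my $languages = $tc->classify_contents_for_encoding(\$contents, $filename, $encoding);
        print "likely languages: ", join(", ", @$languages), "\n";
    }
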
###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in the Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package textcat;

use strict;

# OPTIONS
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse a result must be before it is ignored

my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = ();     # map of cached text-strings each to array of char-encodings for the strings themselves
my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
my $MAX_CACHE_SIZE = 1000;

sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # open directory to find which languages are supported
    opendir DIR, "$model_dir" or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm// && -r "$model_dir/$_.lm" } readdir(DIR));
    closedir DIR;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
        "language models must reside in files with .lm ending\n";

    # load model and count for each language.
    foreach my $language (@languages) {
        my %ngram = ();
        my $rang = 1;
        open(LM, "$model_dir/$language.lm") || die "cannot open $language.lm: $!\n";
        while (<LM>) {
            chomp;
            # only use lines starting with appropriate character. Others are ignored.
            if (/^[^$non_word_characters]+/o) {
                $self->{'ngrams'}->{$language}->{$&} = $rang++;
            }
        }
        close(LM);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}


# CLASSIFICATION
#
# What language is a text string?
# Input: reference to a text string
# Output: reference to an array of language names
#
# The optional $filter_by_encoding parameter names an encoding to narrow the
# search down to: only language models that belong to that encoding are
# considered, giving a list of the probable languages for that known encoding.
# This is for when we already know the encoding but are still looking for the language.
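# For example (a hypothetical usage sketch; $tc is a textcat object and $text
# holds the text to be identified -- neither name comes from this module, and
# the "utf-8" filter assumes models with a utf8 suffix are installed):
#
#     my $all_langs  = $tc->classify(\$text);            # consider every language model
#     my $utf8_langs = $tc->classify(\$text, "utf-8");    # only models matching that encoding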
sub classify {
    my ($self, $inputref, $filter_by_encoding) = @_;
    my $languages = [];

    # filter language filenames by encoding
    if (defined $filter_by_encoding) {
        # make sure to normalize language and filtering encoding so we are not
        # stuck comparing hyphens with underscores in such things as iso-8859-1
        my $normalized_filter = $filter_by_encoding;
        $normalized_filter =~ s/[\W\_]//g;

        foreach my $lang (@{$self->{'languages'}}) {
            my $normalized_lang = $lang;
            $normalized_lang =~ s/[\W\_]//g;

            if ($normalized_lang =~ m/$normalized_filter/i) {
                push (@$languages, $lang);
            }
        }
    }

    # if the filter_by_encoding wasn't in the list of language model filenames
    # or if we're not filtering, then work with all language model filenames
    if (scalar @$languages == 0) {
        $languages = $self->{'languages'};
    }

    my %results = ();
    my $maxp = $self->{'opt_t'};

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@$languages) {
        # compare language model with input ngrams list
        my ($i, $p) = (0, 0);
        while ($i < scalar (@$unknown)) {
            if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
                $p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
            } else {
                $p = $p + $maxp;
            }
            ++$i;
        }
        $results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    my $a = $results{$results[0]};

    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $a)) {
        @answers = (@answers, shift(@results));
    }

    return \@answers;
}


# Like classify_contents() below, but caches the textcat results on filenames
# for subsequent use. The cache is a map of the filename to the corresponding
# filename_encodings (an array of results returned by textcat of the possible
# filename-encodings for the indexing filename string itself).
# Need to make sure that the filename is only the tailname: no path and no
# extension (no digits), in order to make optimum use of the cached textcat results.
# Textcat is performed on $filename_ref and the results are associated with $filename_ref.
# The cache is cleared when max_cache_size is reached (MAX_CACHE_SIZE by default,
# or whatever was set through set_opts). The cache can also be cleared explicitly
# by a call to clear_filename_cache.
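# A hypothetical example (the caller is assumed to have already reduced the
# filename to its tailname, with the path and extension stripped off):
#
#     my $tailname = "myfile";
#     my $possible_encodings = $tc->classify_cached_filename(\$tailname);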
sub classify_cached_filename {
    my ($self, $filename_ref) = @_;

    # if not already in the cache, work it out and put it there
    if (!defined $filename_cache{$$filename_ref})
    {
        if (scalar (keys %filename_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filename_cache();
        }
        $filename_cache{$$filename_ref} = $self->classify($filename_ref);
    }

    # return cached array of encodings for the given string
    return $filename_cache{$$filename_ref};
}

# Like classify_cached_filename() above, but caches the textcat results on file
# contents for subsequent use: textcat is run on a file's contents to work out
# the possible encodings of those contents, using the cache.
# The cache is a map of the filename to the array of possible encodings (as
# returned by textcat) for the *contents* of the file.
# Textcat is performed on $contents_ref and the results are associated with $filename.
# The cache is cleared when max_cache_size is reached (MAX_CACHE_SIZE by default,
# or whatever was set through set_opts). The cache can also be cleared explicitly
# by a call to clear_filecontents_cache.
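# A hypothetical illustration of the caching (filename and contents are made-up
# values): the second call below does not run textcat again, it simply returns
# the array ref that was cached for "doc.txt" by the first call.
#
#     my $first  = $tc->classify_contents(\$contents, "doc.txt");   # runs textcat
#     my $second = $tc->classify_contents(\$contents, "doc.txt");   # answered from the cache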
sub classify_contents {
    my ($self, $contents_ref, $filename) = @_;

    # if not already in the cache, work it out and put it there
    if (!defined $filecontents_cache{$filename})
    {
        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filecontents_cache();
        }

        # Finally, we can perform the textcat classification of language and encoding
        $filecontents_cache{$filename} = $self->classify($contents_ref);
    }
    # return cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}


# Given the known encoding for a file's contents, performs a textcat
# filtering on the languages for the given encoding. Results are stored
# in the cache TWICE: once under $filename|$filter_by_encoding, and
# once under the usual $filename, so that subsequent calls to either
# this method or classify_contents using the same filename will not
# perform textcat again.
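# A hypothetical example: if an earlier stage has established that the contents
# of "doc.txt" are ISO-8859-1, the call below only considers language models whose
# names match that encoding, and a later classify_contents($contents_ref, "doc.txt")
# call is answered straight from the cache:
#
#     my $langs = $tc->classify_contents_for_encoding($contents_ref, "doc.txt", "iso_8859_1");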
sub classify_contents_for_encoding {
    my ($self, $contents_ref, $filename, $filter_by_encoding) = @_;

    if (!defined $filecontents_cache{"$filename|$filter_by_encoding"})
    {
        if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
            $self->clear_filecontents_cache();
        }

        $filecontents_cache{"$filename|$filter_by_encoding"} = $self->classify($contents_ref, $filter_by_encoding);
        # store the same result in the cache again under the $filename entry, so that
        # subsequent calls to classify_contents will find it in the cache already
        # (no need to run textcat a second time)
        $filecontents_cache{$filename} = $filecontents_cache{"$filename|$filter_by_encoding"};
    }
    return $filecontents_cache{$filename};
}


# This method returns the most frequently occurring encoding,
# but only if any encoding occurs more than once in the given results.
# Otherwise, "" is returned.
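# For example (hypothetical model names): given ["en-iso8859_1", "fr-iso8859_1",
# "en-utf8"] the encoding iso8859_1 occurs twice, so "iso8859_1" is returned;
# given ["en-iso8859_1", "fr-utf8", "zh-gb"] every encoding occurs only once,
# so "" is returned.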
sub most_frequent_encoding {
    my ($self, $results) = @_;
    my $best_encoding = "";

    # guessed_encodings is a hashmap of Encoding -> Frequency pairs
    my %guessed_encodings = ();
    foreach my $result (@$results) {
        # Get the encoding portion of a language-model filename like en-iso8859_1
        my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
        next unless defined $encoding; # skip model names without an encoding part
        if (!defined($guessed_encodings{$encoding})) {
            $guessed_encodings{$encoding} = 0;
        }
        $guessed_encodings{$encoding}++;
    }

    $guessed_encodings{""} = -1; # for default best_encoding of ""

    foreach my $enc (keys %guessed_encodings) {
        if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
            $best_encoding = $enc;
        }
    }

    # If the best encoding's frequency is only 1, then every encoding in the
    # results occurred exactly once, so there is no clear winner to report
    # and "" is returned instead.
    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
}


# set some of the specific member variables
sub set_opts {
    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    $self->{'opt_f'} = $opt_freq if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top if defined $opt_top;

    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;
}

sub get_opts {
    my $self = shift (@_);
    return ($self->{'opt_f'}, $self->{'opt_u'}, $self->{'opt_t'}, $self->{'max_cache_size'});
}

# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # assigning a fresh empty hash releases the old entries for garbage collection
    %filename_cache = ();
}

# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    %filecontents_cache = ();
}

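# A small worked example of what create_lm() below counts (hypothetical input):
# for the single word "cat" the padded token is "_cat_", and the ngrams counted are
#     _  c  a  t  _c  ca  at  t_  _ca  cat  at_  _cat  cat_  _cat_
# (with "_" counted twice), i.e. every substring of length 1 to 5 starting at each
# position. Ngrams occurring no more than opt_f times (1 by default) are then
# dropped, and the remaining ones are returned sorted by descending frequency,
# truncated to the opt_t most frequent.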
sub create_lm {
    # Builds the ngram profile for the given text: a hash of ngram -> frequency
    # is filled in from every word in the text, then reduced to a sorted list
    # of the most frequent ngrams.
    my ($self, $textref) = @_;

    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
        $word = "_" . $word . "_";
        my $len = length($word);
        my $flen = $len;
        my $i;

        # count every ngram of length 1 to 5 starting at position $i
        for ($i = 0; $i < $flen; $i++) {
            $ngram->{substr($word,$i,5)}++ if $len > 4;
            $ngram->{substr($word,$i,4)}++ if $len > 3;
            $ngram->{substr($word,$i,3)}++ if $len > 2;
            $ngram->{substr($word,$i,2)}++ if $len > 1;
            $ngram->{substr($word,$i,1)}++;
            $len--;
        }
    }

    # drop rare ngrams (those occurring no more than opt_f times)
    map { if ($ngram->{$_} <= $self->{'opt_f'}) { delete $ngram->{$_}; }
    } keys %$ngram;

    # sort the ngrams, and spit out the $opt_t most frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}

1;