[1316] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
| 3 | # textcat.pm -- Identify the language of a piece of text
|
---|
| 4 | #
|
---|
| 5 | #
|
---|
| 6 | # This file is based on TextCat version 1.08 by Gertjan van Noord
|
---|
| 7 | # Copyright (C) 1997 Gertjan van Noord ([email protected])
|
---|
| 8 | # TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
|
---|
| 9 | #
|
---|
| 10 | # It was modified by Gordon Paynter ([email protected]) and turned
|
---|
| 11 | # into a package for use in Greenstone digital library system. Most of
|
---|
| 12 | # the modifications consist of commenting out or deleting functionality
|
---|
| 13 | # I don't need.
|
---|
| 14 | #
|
---|
| 15 | #
|
---|
| 16 | # This program is free software; you can redistribute it and/or modify
|
---|
| 17 | # it under the terms of the GNU General Public License as published by
|
---|
| 18 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 19 | # (at your option) any later version.
|
---|
| 20 | #
|
---|
| 21 | # This program is distributed in the hope that it will be useful,
|
---|
| 22 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 23 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 24 | # GNU General Public License for more details.
|
---|
| 25 | #
|
---|
| 26 | # You should have received a copy of the GNU General Public License
|
---|
| 27 | # along with this program; if not, write to the Free Software
|
---|
| 28 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 29 | #
|
---|
| 30 | ###########################################################################
|
---|
| 31 |
|
---|
package textcat;

# Enforce lexical declarations and enable runtime diagnostics.  `use warnings`
# was missing from the original; it surfaces mistakes such as assigning undef
# to a hash (see the cache-clearing subs below).
use strict;
use warnings;
---|
# OPTIONS
# Directory holding the "<language>.lm" model files shipped with Greenstone.
# NOTE(review): assumes GSDLHOME is set in the environment when this module
# is loaded -- confirm callers only use textcat inside a configured install.
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # Ngrams which occur <= this number of times are removed
my $opt_t = 400;  # topmost number of ngrams that should be used
my $opt_u = 1.05; # how much worse result must be before it is ignored

# Character class treated as word separators when building ngrams (digits and
# whitespace); also used to filter lines read from the .lm model files.
my $non_word_characters = '0-9\s';

# caching related
my %filename_cache = ();     # map of cached text-strings each to array of char-encodings for the strings themselves
my %filecontents_cache = (); # map of cached filenames to array of char-encodings for the contents of the files
my $MAX_CACHE_SIZE = 1000;   # default number of entries before a cache is wholesale-cleared
| 49 |
|
---|
# Constructor.  Optional arguments override the module-level defaults for
# opt_f (rare-ngram cutoff), opt_t (ngram list length) and opt_u (tolerance
# factor).  Dies if no readable language models are found in $model_dir.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # Discover the supported languages: one "<language>.lm" file each.
    # Lexical dir handle and anchored extension strip (the original used a
    # bareword handle and an unanchored s/\.lm//, which could mangle names
    # containing ".lm" in the middle).
    opendir my $model_dh, $model_dir or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir $model_dh;
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Load the ranked ngram model for each language: each usable line of a
    # .lm file contributes one ngram whose rank is its position in the file.
    foreach my $language (@languages) {
	my $rank = 1;
	# Three-arg open with a lexical handle; the original used the
	# bareword LM and two-arg open.
	open my $lm_fh, '<', "$model_dir/$language.lm"
	    or die "cannot open $language.lm: $!\n";
	while (my $line = <$lm_fh>) {
	    chomp $line;
	    # Only use lines starting with an ngram character; other lines
	    # (blank lines, frequency counts) are ignored.  A capture group
	    # replaces the original's $& (which penalises all regexes).
	    if ($line =~ /^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rank++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;

    # Per-instance options fall back to the module-level defaults.
    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
|
---|
| 87 |
|
---|
| 88 |
|
---|
| 89 |
|
---|
[1316] | 90 | # CLASSIFICATION
|
---|
| 91 | #
|
---|
| 92 | # What language is a text string?
|
---|
| 93 | # Input: text string
|
---|
| 94 | # Output: array of language names
|
---|
| 95 |
|
---|
# Identify the language(s) of a text string.
# Input:  $inputref -- reference to the text string; optional per-call
#         overrides for opt_f, opt_u and opt_t (NOTE: these persist on the
#         object for subsequent calls -- long-standing behaviour).
# Output: reference to an array of language names, best match first, keeping
#         every candidate scoring within opt_u times the best score.
sub classify {
    my ($self, $inputref, $opt_freq, $opt_factor, $opt_top) = @_;

    $self->{'opt_f'} = $opt_freq   if defined $opt_freq;
    $self->{'opt_u'} = $opt_factor if defined $opt_factor;
    $self->{'opt_t'} = $opt_top    if defined $opt_top;

    my %results = ();
    # Penalty for an input ngram that a language model lacks: the maximum
    # possible rank distance.
    my $maxp = $self->{'opt_t'};

    # Build the ranked ngram list for the input text.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@{$self->{'languages'}}) {

	# Out-of-place score: sum, over the input ngrams, of the distance
	# between each ngram's rank in the input and its rank in the model
	# ($maxp when the model does not contain the ngram).  Lower is better.
	my $model = $self->{'ngrams'}->{$language};
	my $distance = 0;
	for my $i (0 .. $#$unknown) {
	    if (defined $model->{$unknown->[$i]}) {
		$distance += abs($model->{$unknown->[$i]} - $i);
	    } else {
		$distance += $maxp;
	    }
	}
	$results{$language} = $distance;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    # Best (lowest) score.  Deliberately NOT named $a: the original declared
    # `my $a`, which shadows sort's package variable -- a classic Perl trap.
    my $best_score = $results{$results[0]};

    # Keep the best answer plus any others within opt_u * best.
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $best_score)) {
	push @answers, shift(@results);
    }

    return \@answers;
}
|
---|
| 134 |
|
---|
[16674] | 135 | # Same as above, but caches textcat results on filenames for subsequent use.
|
---|
| 136 | # The cache is a map of the filename to the corresponding filename_encodings
|
---|
| 137 | # (an array of results returned by textcat of the possible filename-encodings
|
---|
| 138 | # for the indexing filename string itself).
|
---|
| 139 | # Need to make sure that the filename is only the tailname: no path and no
|
---|
| 140 | # extension (no digits), in order to make optimum use of cached textcat.
|
---|
| 141 | # Textcat is performed on $filename_ref and the results associated with $filename_ref.
|
---|
| 142 | # The cache will be cleared when the max_cache_size is reached, which is
|
---|
| 143 | # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
|
---|
| 144 | # can also be cleared by a call to clear_filename_cache.
|
---|
# Cached variant of classify() keyed on the string itself: textcat is run on
# $filename_ref and the result memoised in %filename_cache under that string.
# Callers should pass only the tailname (no path, no digit-bearing extension)
# to get the best hit rate.  When the cache reaches max_cache_size (default
# MAX_CACHE_SIZE, overridable via the last parameter) it is cleared wholesale;
# clear_filename_cache() empties it on demand.
sub classify_cached_filename {
    my ($self, $filename_ref, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    my $key = $$filename_ref;

    # On a miss, evict everything if full, then compute and memoise.
    unless (defined $filename_cache{$key}) {
	$self->clear_filename_cache()
	    if scalar(keys %filename_cache) >= $self->{'max_cache_size'};
	$filename_cache{$key} =
	    $self->classify($filename_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # Cached array of possible encodings for the given string.
    return $filename_cache{$key};
}
|
---|
| 161 |
|
---|
| 162 | # Same as above, but caches textcat results on filecontents for subsequent use.
|
---|
| 163 | # The cache is a map of the filename to an array of possible filename_encodings
|
---|
| 164 | # for the *contents* of the file returned by textcat.
|
---|
| 165 | # Textcat is performed on $contents_ref and the results associated with $filename.
|
---|
| 166 | # The cache will be cleared when the max_cache_size is reached, which is
|
---|
| 167 | # MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
|
---|
| 168 | # can also be cleared by a call to clear_filecontents_cache.
|
---|
# Cached variant of classify() keyed on a filename: textcat is run on
# $contents_ref (the file's contents) and the result memoised in
# %filecontents_cache under $filename.  When the cache reaches max_cache_size
# (default MAX_CACHE_SIZE, overridable via the last parameter) it is cleared
# wholesale; clear_filecontents_cache() empties it on demand.
sub classify_cached_filecontents {
    my ($self, $contents_ref, $filename, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;
    $self->{'max_cache_size'} = $max_size_of_cache if defined $max_size_of_cache;

    # On a miss, evict everything if full, then compute and memoise.
    unless (defined $filecontents_cache{$filename}) {
	$self->clear_filecontents_cache()
	    if scalar(keys %filecontents_cache) >= $self->{'max_cache_size'};
	$filecontents_cache{$filename} =
	    $self->classify($contents_ref, $opt_freq, $opt_factor, $opt_top);
    }

    # Cached array of content encodings for the given filename.
    return $filecontents_cache{$filename};
}
|
---|
[16554] | 185 |
|
---|
# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # Assigning the empty list releases every entry and its memory.  The
    # original also did `%filename_cache = undef`, which merely raises an
    # "Odd number of elements" warning and briefly creates a bogus
    # "" => undef entry before being overwritten -- removed.
    %filename_cache = ();
}
|
---|
| 193 |
|
---|
# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # Assigning the empty list releases every entry and its memory.  The
    # original also did `%filecontents_cache = undef`, which merely raises an
    # "Odd number of elements" warning and briefly creates a bogus
    # "" => undef entry before being overwritten -- removed.
    %filecontents_cache = ();
}
|
---|
| 201 |
|
---|
# Build the ngram profile of the text in $textref: count all 1..5-grams of
# each word (padded with '_' markers so ngrams can anchor to word
# boundaries), drop ngrams occurring no more than opt_f times, and return a
# reference to the surviving ngrams sorted by descending frequency,
# truncated to the opt_t most frequent.
sub create_lm {
    my ($self, $textref) = @_;

    # Hash mapping each ngram to its occurrence count.
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);
	my $flen = $len;

	# At each start position, count every ngram of length 1..5 that still
	# fits in the word; $len shrinks with the remaining suffix while
	# $flen keeps the full length for the loop bound.
	for (my $i = 0; $i < $flen; $i++) {
	    $ngram->{substr($word,$i,5)}++ if $len > 4;
	    $ngram->{substr($word,$i,4)}++ if $len > 3;
	    $ngram->{substr($word,$i,3)}++ if $len > 2;
	    $ngram->{substr($word,$i,2)}++ if $len > 1;
	    $ngram->{substr($word,$i,1)}++;
	    $len--;
	}
    }

    # Remove rare ngrams (count <= opt_f).  The original used map in void
    # context purely for its delete side effect; a foreach states the intent.
    # (keys returns a snapshot list, so deleting during the loop is safe.)
    foreach my $key (keys %$ngram) {
	delete $ngram->{$key} if $ngram->{$key} <= $self->{'opt_f'};
    }

    # Sort the ngrams and return the opt_t most frequent ones.
    # Adding `or $a cmp $b' in the sort block would give a unique (stable)
    # result but makes sorting about five times slower.
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}
|
---|
| 235 |
|
---|
1; # A Perl module must end by returning a true value.