###########################################################################
#
# textcat.pm -- Identify the language of a piece of text
#
#
# This file is based on TextCat version 1.08 by Gertjan van Noord
# Copyright (C) 1997 Gertjan van Noord ([email protected])
# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
#
# It was modified by Gordon Paynter ([email protected]) and turned
# into a package for use in the Greenstone digital library system. Most of
# the modifications consist of commenting out or deleting functionality
# I don't need.
#
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
| 31 |
|
---|
package textcat;

use strict;

# OPTIONS (defaults; per-instance values live in $self — see new/set_opts)
my $model_dir = $ENV{'GSDLHOME'} . "/perllib/textcat";

my $opt_f = 1;    # ngrams occurring <= this many times are discarded
my $opt_t = 400;  # number of top-ranked ngrams kept per model
my $opt_u = 1.05; # tolerance: how much worse a score may be before it is ignored

# characters that never take part in an ngram (digits and whitespace)
my $non_word_characters = '0-9\s';

# caching related
my %filename_cache     = (); # text-string -> array of candidate encodings for the string itself
my %filecontents_cache = (); # filename -> array of candidate encodings for the file's contents
my $MAX_CACHE_SIZE = 1000;
---|
# Constructor. Optional arguments override the default values of
# opt_f (frequency threshold), opt_t (top ngram count) and opt_u
# (score tolerance factor). Dies if no language model files can be read.
sub new {
    my $class = shift (@_);
    my ($tmp_f, $tmp_t, $tmp_u) = @_;

    my $self = {};

    # Open the model directory to find out which languages are supported.
    # A language is supported when a readable "<language>.lm" file exists.
    # (3-arg open/lexical handles replace the old bareword DIR/LM handles.)
    opendir(my $model_dh, $model_dir) or die "directory $model_dir: $!\n";
    my @languages = sort(grep { s/\.lm$// && -r "$model_dir/$_.lm" } readdir($model_dh));
    closedir($model_dh);
    @languages or die "sorry, can't read any language models from $model_dir\n" .
	"language models must reside in files with .lm ending\n";

    # Load the ngram model for each language: each ngram is mapped to its
    # rank within that language's model file (1 = most frequent ngram).
    foreach my $language (@languages) {
	my $rang = 1;
	open(my $lm_fh, '<', "$model_dir/$language.lm")
	    || die "cannot open $language.lm: $!\n";
	while (my $line = <$lm_fh>) {
	    chomp $line;
	    # only use lines starting with an ngram character; others are
	    # ignored.  A capture group replaces the old use of $&, which
	    # imposes a global performance penalty on all regex matches.
	    if ($line =~ /^([^$non_word_characters]+)/o) {
		$self->{'ngrams'}->{$language}->{$1} = $rang++;
	    }
	}
	close($lm_fh);
    }

    $self->{'languages'} = \@languages;

    $self->{'opt_f'} = defined($tmp_f) ? $tmp_f : $opt_f;
    $self->{'opt_t'} = defined($tmp_t) ? $tmp_t : $opt_t;
    $self->{'opt_u'} = defined($tmp_u) ? $tmp_u : $opt_u;
    $self->{'max_cache_size'} = $MAX_CACHE_SIZE;

    return bless $self, $class;
}
| 87 |
|
---|
| 88 |
|
---|
# CLASSIFICATION
#
# What language is a text string?
# Input: reference to a text string
# Output: reference to an array of language-model names, best match first
# $filter_by_encoding, when given, narrows the set of language models
# considered down to those whose filename matches the known encoding, in
# order to obtain a list of the probable languages for that encoding.
# This is for when we already know the encoding but not the language.
sub classify {
    my ($self, $inputref, $filter_by_encoding) = @_;
    # start with an empty arrayref (the old "my $languages; @$languages = ()"
    # only worked through autovivification of an undef reference)
    my $languages = [];

    # filter language filenames by encoding
    if (defined $filter_by_encoding) {
	# make sure to normalize language and filtering encoding so we are not
	# stuck comparing hyphens with underscores in such things as iso-8859-1
	my $normalized_filter = $filter_by_encoding;
	$normalized_filter =~ s/[\W\_]//g;

	foreach my $lang (@{$self->{'languages'}}) {
	    my $normalized_lang = $lang;
	    $normalized_lang =~ s/[\W\_]//g;

	    # \Q..\E guards against regex metacharacters in the filter string
	    if ($normalized_lang =~ m/\Q$normalized_filter\E/i) {
		push (@$languages, $lang);
	    }
	}
    }

    # if the filter_by_encoding wasn't in the list of language model filenames
    # or if we're not filtering, then work with all language model filenames
    if (scalar @$languages == 0) {
	$languages = $self->{'languages'};
    }

    my %results = ();
    my $maxp = $self->{'opt_t'};  # penalty for an ngram absent from a model

    # create ngrams for input.
    my $unknown = $self->create_lm($inputref);

    foreach my $language (@$languages) {
	# compare language model with input ngrams list; the score is the
	# summed rank distance over all input ngrams (lower is better)
	my ($i, $p) = (0, 0);
	while ($i < scalar (@$unknown)) {
	    if (defined ($self->{'ngrams'}->{$language}->{$unknown->[$i]})) {
		$p = $p + abs($self->{'ngrams'}->{$language}->{$unknown->[$i]} - $i);
	    } else {
		$p = $p + $maxp;
	    }
	    ++$i;
	}
	$results{$language} = $p;
    }

    my @results = sort { $results{$a} <=> $results{$b} } keys %results;
    # renamed from "my $a", which shadowed sort's package variable $a
    my $best_score = $results{$results[0]};

    # keep every language whose score is within opt_u times the best score
    my @answers = (shift(@results));
    while (@results && $results{$results[0]} < ($self->{'opt_u'} * $best_score)) {
	push(@answers, shift(@results));
    }

    return \@answers;
}
| 157 |
|
---|
[17214] | 158 |
|
---|
# Same as below, but caches textcat results on filenames for subsequent use.
# The cache is a map of the filename to the corresponding filename_encodings
# (an array of results returned by textcat of the possible filename-encodings
# for the indexing filename string itself).
# Need to make sure that the filename is only the tailname: no path and no
# extension (no digits), in order to make optimum use of the cached textcat.
# Textcat is performed on $filename_ref and the results associated with $filename_ref.
# The cache will be cleared when the max_cache_size is reached, which is
# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
# can also be cleared by a call to clear_filename_cache.
sub classify_cached_filename {
    my ($self, $filename_ref) = @_;

    my $key = $$filename_ref;

    # on a cache miss, run textcat and remember the answer
    unless (defined $filename_cache{$key}) {
	# evict everything once the cache has grown too large
	$self->clear_filename_cache()
	    if scalar(keys %filename_cache) >= $self->{'max_cache_size'};

	$filename_cache{$key} = $self->classify($filename_ref);
    }

    # cached array of encodings for the given string
    return $filename_cache{$key};
}
---|
| 184 |
|
---|
# Same as above, but caches textcat results on filecontents for subsequent use.
# Textcat on a file's contents to work out its possible encodings. Uses the cache.
# The cache is a map of the filename to an array of possible filename_encodings
# for the *contents* of the file returned by textcat.
# Textcat is performed on $contents_ref and the results associated with $filename.
# The cache will be cleared when the max_cache_size is reached, which is
# MAX_CACHE_SIZE by default or can be specified as a parameter. The cache
# can also be cleared by a call to clear_filecontents_cache.
sub classify_contents {
    my ($self, $contents_ref, $filename) = @_;

    # on a cache miss, classify the contents and remember the answer
    unless (defined $filecontents_cache{$filename}) {
	# evict everything once the cache has grown too large
	$self->clear_filecontents_cache()
	    if scalar(keys %filecontents_cache) >= $self->{'max_cache_size'};

	# perform the textcat classification of language and encoding
	$filecontents_cache{$filename} = $self->classify($contents_ref);
    }

    # cached array of content encodings for the given filename
    return $filecontents_cache{$filename};
}
[16554] | 209 |
|
---|
[17214] | 210 |
|
---|
# Given the known encoding for a file's contents, performs a textcat
# filtering on the languages for the given encoding. Results are stored
# in the cache TWICE: once under $filename|$filter_by_encoding, and
# once under the usual $filename, so that subsequent calls to either
# this method or classify_contents using the same filename will not
# perform textcat again.
sub classify_contents_for_encoding {
    my ($self, $contents_ref, $filename, $filter_by_encoding) = @_;

    my $cache_key = "$filename|$filter_by_encoding";

    if (!defined $filecontents_cache{$cache_key})
    {
	if (scalar (keys %filecontents_cache) >= $self->{'max_cache_size'}) {
	    $self->clear_filecontents_cache();
	}

	# Classify ONCE and store the result under both keys: the original
	# code ran the expensive classification twice for the same answer.
	# The second entry lets subsequent calls to classify_contents find
	# the result in the cache already.
	my $result = $self->classify($contents_ref, $filter_by_encoding);
	$filecontents_cache{$cache_key} = $result;
	$filecontents_cache{$filename}  = $result;
    }
    return $filecontents_cache{$filename};
}
| 233 |
|
---|
| 234 |
|
---|
# This method returns the most frequently occurring encoding,
# but only if any encoding occurs more than once in the given results.
# Otherwise, "" is returned.
sub most_frequent_encoding {
    my ($self, $results) = @_;
    my $best_encoding = "";

    # guessed_encodings is a hashmap of Encoding -> Frequency pairs
    my %guessed_encodings = ();
    foreach my $result (@$results) {
	# Get the encoding portion of a language-model filename like en-iso8859_1
	my ($encoding) = ($result =~ /^(?:[^\-]+)\-([^\-]+)$/);
	# Skip results that don't follow the <language>-<encoding> naming
	# scheme: previously an undef $encoding was used as a hash key
	# (warning under "use warnings"), and that "" bucket was clobbered
	# by the -1 sentinel below anyway, so this is behavior-identical.
	next unless defined $encoding;
	$guessed_encodings{$encoding}++;
    }

    $guessed_encodings{""} = -1; # sentinel for the default best_encoding of ""

    foreach my $enc (keys %guessed_encodings) {
	if ($guessed_encodings{$enc} > $guessed_encodings{$best_encoding}) {
	    $best_encoding = $enc;
	}
    }

    # If best_encoding's frequency == 1, then the frequency for all encodings will
    # be 1 since the sum total of all frequencies is num_results: if any encoding
    # has frequency > 1 (it's possibly the best_encoding), one or more of the others
    # would have been at 0 frequency to compensate.
    return ($guessed_encodings{$best_encoding} > 1) ? $best_encoding : "";
}
| 267 |
|
---|
| 268 |
|
---|
# Setter for the per-instance tuning options. Any argument left undef
# keeps the current value: opt_freq -> opt_f, opt_factor -> opt_u,
# opt_top -> opt_t, max_size_of_cache -> max_cache_size.
sub set_opts {
    my ($self, $opt_freq, $opt_factor, $opt_top, $max_size_of_cache) = @_;

    my %updates = (
	'opt_f'          => $opt_freq,
	'opt_u'          => $opt_factor,
	'opt_t'          => $opt_top,
	'max_cache_size' => $max_size_of_cache,
    );

    # only overwrite a member when a defined value was supplied for it
    while (my ($member, $value) = each %updates) {
	$self->{$member} = $value if defined $value;
    }
}
| 279 |
|
---|
# Getter returning the tuning options as a flat list, in the same order
# set_opts documents them: (opt_f, opt_u, opt_t, max_cache_size).
sub get_opts {
    my ($self) = @_;
    return map { $self->{$_} } ('opt_f', 'opt_u', 'opt_t', 'max_cache_size');
}
| 284 |
|
---|
# Clears the filename cache (a map of strings to the textcat results for each string).
sub clear_filename_cache {
    my $self = shift (@_);

    # Emptying the hash releases its contents for garbage collection.
    # (The previous "%filename_cache = undef;" raised an "Odd number of
    # elements" warning and was immediately overwritten — removed.)
    %filename_cache = ();
}
| 292 |
|
---|
# Clears the filecontents cache (a map of filenames to the textcat results on the contents of each file).
sub clear_filecontents_cache {
    my $self = shift (@_);

    # Emptying the hash releases its contents for garbage collection.
    # (The previous "%filecontents_cache = undef;" raised an "Odd number
    # of elements" warning and was immediately overwritten — removed.)
    %filecontents_cache = ();
}
| 300 |
|
---|
# Build an ngram frequency model from the text referenced by $textref and
# return a reference to the list of the opt_t most frequent ngrams, most
# frequent first. Every word is padded with '_' so word boundaries take
# part in the ngrams; all ngrams of length 1..5 are counted.
sub create_lm {
    my ($self, $textref) = @_;

    # $ngram maps each ngram string to its occurrence count
    my $ngram = {};

    foreach my $word (split(/[$non_word_characters]+/, $$textref)) {
	$word = "_" . $word . "_";
	my $len = length($word);

	foreach my $start (0 .. $len - 1) {
	    # characters remaining from this position; an ngram of size k
	    # is counted iff at least k characters remain (this replaces
	    # the original decrement-$len trick with the same effect)
	    my $remaining = $len - $start;
	    foreach my $size (1 .. 5) {
		last if $size > $remaining;
		$ngram->{substr($word, $start, $size)}++;
	    }
	}
    }

    # remove ngrams at or below the frequency threshold opt_f
    # (explicit loop instead of the original map-in-void-context)
    foreach my $key (keys %$ngram) {
	delete $ngram->{$key} if $ngram->{$key} <= $self->{'opt_f'};
    }

    # sort the ngrams, and spit out the $opt_t frequent ones.
    # adding `or $a cmp $b' in the sort block makes sorting five
    # times slower..., although it would be somewhat nicer (unique result)
    my @sorted = sort { $ngram->{$b} <=> $ngram->{$a} } keys %$ngram;
    splice(@sorted, $self->{'opt_t'}) if (@sorted > $self->{'opt_t'});
    return \@sorted;
}

1;