########################################################################### # # ngramj.pm -- Identify the language of a piece of text # # # This file is based on TextCat version 1.08 by Gertjan van Noord # Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl) # TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat # # It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned # into a package for use in Greenstone digital library system. Most of # the modifications consist of commenting out or deleting functionality # I don't need. # # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
#
###########################################################################

package ngramj;

use strict;
use warnings;

# Construct a classifier object.
#
# The only state kept is the base command line used to launch the cngram
# jar, which is looked up under $GSDLHOME/ext/ngramj/jars.  The util
# package is expected to have been loaded already by the caller.
sub new {
    my $class = shift(@_);

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");

    my $self = { 'java_cmd' => "java -jar $ngram_jar" };

    return bless $self, $class;
}

# Run the cngram language identifier over a piece of text.
#
#   $contents_ref - reference to the text to classify
#   $filename     - accepted for interface compatibility; not used here
#   $opt_encoding - optional; when defined it is appended to the command
#                   line and suffixed to every language as "lang-encoding"
#
# Returns a reference to an array of language strings ("lang", or
# "lang-encoding" when $opt_encoding is defined), in the order they are
# reported on the "speed:" line of the tool's output.  Returns undef if the
# temporary file cannot be written or the pipe to java cannot be opened.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # Save the contents to a temporary file for the java tool to read.
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $out_fh;
    if (!open($out_fh, '>', $tmp_txt_filename)) {
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$!\n";
        return undef;
    }
    binmode($out_fh, ":utf8");
    print $out_fh $$contents_ref;

    # Buffered write errors only surface at close, so check it and do not
    # go on to classify a truncated file.
    if (!close($out_fh)) {
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$!\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the command line.  It is handed to open() as a single string,
    # exactly as before, so argument splitting is unchanged.
    my $java_cmd = $self->{'java_cmd'} . " -lang2 $tmp_txt_filename";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $in_fh;
    if (!open($in_fh, '-|', $java_cmd)) {
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$!\n";
        # Do not leave the temporary file behind on the failure path.
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form.
    my $lang_encode_pairs = [];

    while (defined(my $line = <$in_fh>)) {
        # Only the "speed: lang:score lang:score ... .." line is of interest.
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my $lang_group = $1;

        foreach my $lang_score (split(/\s+/, $lang_group)) {
            # The score is required by the pattern but otherwise unused.
            my ($lang) = ($lang_score =~ m/^(.+):.+$/);

            # Skip tokens that are not in lang:score form rather than
            # pushing an undefined entry into the result.
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);
            push(@$lang_encode_pairs, $lang_pair);
        }
    }

    # Closing a pipe handle waits for the child process.  Its exit status
    # is deliberately not treated as a failure: whatever was parsed is
    # still returned.
    close($in_fh);

    &util::rm($tmp_txt_filename);

    # Return the array of language (or language-encoding) strings found.
    return $lang_encode_pairs;
}

# Classify text for one specific encoding.
#
# Thin wrapper that passes $filter_by_encoding to classify_contents as its
# optional encoding argument; the return value is the same.
sub classify_contents_for_encoding {
    my ($self, $contents_ref, $filename, $filter_by_encoding) = @_;

    return $self->classify_contents($contents_ref, $filename, $filter_by_encoding);
}

1;