[25141] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
| 3 | # ngramj.pm -- Identify the language of a piece of text
|
---|
| 4 | #
|
---|
| 5 | #
|
---|
| 6 | # This file is based on TextCat version 1.08 by Gertjan van Noord
|
---|
| 7 | # Copyright (C) 1997 Gertjan van Noord ([email protected])
|
---|
| 8 | # TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
|
---|
| 9 | #
|
---|
| 10 | # It was modified by Gordon Paynter ([email protected]) and turned
|
---|
| 11 | # into a package for use in Greenstone digital library system. Most of
|
---|
| 12 | # the modifications consist of commenting out or deleting functionality
|
---|
| 13 | # I don't need.
|
---|
| 14 | #
|
---|
| 15 | #
|
---|
| 16 | # This program is free software; you can redistribute it and/or modify
|
---|
| 17 | # it under the terms of the GNU General Public License as published by
|
---|
| 18 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 19 | # (at your option) any later version.
|
---|
| 20 | #
|
---|
| 21 | # This program is distributed in the hope that it will be useful,
|
---|
| 22 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 23 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 24 | # GNU General Public License for more details.
|
---|
| 25 | #
|
---|
| 26 | # You should have received a copy of the GNU General Public License
|
---|
| 27 | # along with this program; if not, write to the Free Software
|
---|
| 28 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 29 | #
|
---|
| 30 | ###########################################################################
|
---|
| 31 |
|
---|
| 32 | package ngramj;
|
---|
| 33 |
|
---|
| 34 | use strict;
|
---|
| 35 |
|
---|
| 36 |
|
---|
# Constructor.
#
# Builds the command line used to launch the cngram.jar language identifier
# and stores it in the object under the 'java_cmd' key.  The jar is looked
# up as $ENV{'GSDLHOME'}/ext/ngramj/jars/cngram.jar via util::filename_cat.
# Note that this file does not load the util package itself; it relies on
# the caller having done so already.
#
# Parameters:
#   $class - the class name to bless the new object into
#
# Returns: a hash reference blessed into $class.
sub new {
    my ($class) = @_;

    my $ngram_jar = util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");

    # The command is later executed through a piped open (i.e. by a shell),
    # so the jar path is double-quoted to keep it a single argument even
    # when the installation directory contains spaces.
    my $self = {
        'java_cmd' => "java -jar \"$ngram_jar\"",
    };

    return bless $self, $class;
}
|
---|
| 51 |
|
---|
| 52 |
|
---|
# Identify the language(s) of a piece of text by running cngram.jar over it.
#
# The text is written (as UTF-8) to a temporary file obtained from
# util::get_tmp_filename, the java command stored in $self->{'java_cmd'} is
# run with "-lang2 <tmpfile> [<encoding>]", and every output line of the form
#     speed: lang:score lang:score ... ..
# is parsed into its language names.
#
# Parameters:
#   $contents_ref - reference to a scalar holding the text to classify
#   $filename     - accepted for interface compatibility; not used here
#   $opt_encoding - optional; when defined it is appended to the java command
#                   line and to each returned language as "lang-encoding"
#
# Returns: a reference to an array of language strings, in the order the tool
#   reported them (possibly empty), or undef if the temporary file could not
#   be written or the pipe to java could not be opened.  The temporary file
#   is removed on every path once it has been created.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # Save the contents to a temporary file for the java tool to read
    my $tmp_txt_filename = util::get_tmp_filename("txt");

    my $tmp_fh;
    if (!open($tmp_fh, '>', $tmp_txt_filename)) {
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$!\n";
        # "return undef" (not a bare return) is kept on purpose so that
        # existing callers see exactly the same value in any context
        return undef;
    }
    binmode($tmp_fh, ":utf8");

    # Buffered write errors may only surface at close, so check both
    my $write_error;
    $write_error = "$!" unless print {$tmp_fh} $$contents_ref;
    if (!close($tmp_fh) && !defined $write_error) {
        $write_error = "$!";
    }
    if (defined $write_error) {
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$write_error\n";
        util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the command line; the temporary filename is double-quoted so a
    # temp directory containing spaces still yields a single argument
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_fh;
    if (!open($ngram_fh, '-|', $java_cmd)) {
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$!\n";
        # Do not leave the temporary file behind on failure
        util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form
    my $lang_encode_pairs = [];
    while (defined(my $line = <$ngram_fh>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my $lang_group = $1;

        foreach my $entry (split(/\s+/, $lang_group)) {
            # Each entry is expected to look like "lang:score"; anything
            # else is skipped rather than pushed as an undefined language
            my ($lang) = ($entry =~ m/^(.+):.+$/);
            next unless defined $lang;

            my $lang_pair = defined($opt_encoding) ? "$lang-$opt_encoding" : $lang;
            push(@$lang_encode_pairs, $lang_pair);
        }
    }

    # Closing the pipe waits for the java process and releases the handle.
    # The exit status is deliberately not treated as an error here.
    close($ngram_fh);

    util::rm($tmp_txt_filename);

    return $lang_encode_pairs;
}
|
---|
| 115 |
|
---|
| 116 |
|
---|
# Convenience alias for classify_contents() whose last argument is named for
# its role as an encoding filter.
#
# Parameters:
#   $contents_ref       - reference to a scalar holding the text to classify
#   $filename           - passed through unchanged to classify_contents()
#   $filter_by_encoding - passed through as classify_contents()'s encoding
#                         argument (may be undef)
#
# Returns: whatever classify_contents() returns for the same arguments.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($contents_ref, $filename, $filter_by_encoding) = @_;

    # Always forward exactly three arguments, even when some were omitted
    return $self->classify_contents(
        $contents_ref,
        $filename,
        $filter_by_encoding
    );
}
|
---|
| 123 |
|
---|
| 124 |
|
---|
| 125 |
|
---|
| 126 | 1;
|
---|