1 | ###########################################################################
|
---|
2 | #
|
---|
3 | # ngramj.pm -- Identify the language of a piece of text
|
---|
4 | #
|
---|
5 | #
|
---|
6 | # This file is based on TextCat version 1.08 by Gertjan van Noord
|
---|
7 | # Copyright (C) 1997 Gertjan van Noord ([email protected])
|
---|
8 | # TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
|
---|
9 | #
|
---|
10 | # It was modified by Gordon Paynter ([email protected]) and turned
|
---|
11 | # into a package for use in Greenstone digital library system. Most of
|
---|
12 | # the modifications consist of commenting out or deleting functionality
|
---|
13 | # I don't need.
|
---|
14 | #
|
---|
15 | #
|
---|
16 | # This program is free software; you can redistribute it and/or modify
|
---|
17 | # it under the terms of the GNU General Public License as published by
|
---|
18 | # the Free Software Foundation; either version 2 of the License, or
|
---|
19 | # (at your option) any later version.
|
---|
20 | #
|
---|
21 | # This program is distributed in the hope that it will be useful,
|
---|
22 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
23 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
24 | # GNU General Public License for more details.
|
---|
25 | #
|
---|
26 | # You should have received a copy of the GNU General Public License
|
---|
27 | # along with this program; if not, write to the Free Software
|
---|
28 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
29 | #
|
---|
30 | ###########################################################################
|
---|
31 |
|
---|
32 | package ngramj;
|
---|
33 |
|
---|
34 | use strict;
|
---|
35 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
36 |
|
---|
# new -- construct an ngramj language classifier
#
# Arguments:
#   $class     - the class name to bless the new object into
#   $verbosity - stored as 'verbosity'; classify_contents prints a summary
#                of the tool's output when this is >= 2
#   $outhandle - stored as 'outhandle'; the handle that summary is printed to
#
# Returns a blessed hash that also holds 'java_cmd', the command line prefix
# used to launch cngram.jar from $GSDLHOME/ext/ngramj/jars.
sub new {
    my $class = shift(@_);
    my ($verbosity, $outhandle) = @_;

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj",
                                        "jars", "cngram.jar");

    # The command is later run through the shell as a single string, so the
    # jar path is quoted to survive installation directories that contain
    # spaces (e.g. "C:\Program Files\...").
    my $self = {
        'verbosity' => $verbosity,
        'outhandle' => $outhandle,
        'java_cmd'  => "java -jar \"$ngram_jar\"",
    };

    return bless $self, $class;
}
|
---|
51 |
|
---|
52 |
|
---|
# classify_contents -- identify the likely language(s) of a piece of text
#
# The text is written to a temporary file, the command stored in
# $self->{'java_cmd'} is run over that file with the "-lang2" option, and
# its output is parsed.
#
# Arguments:
#   $contents_ref - reference to the string holding the text to classify
#   $filename     - not used by this method (kept for interface
#                   compatibility with callers)
#   $opt_encoding - optional; when defined it is passed as an extra argument
#                   to the command and appended to every language returned
#
# Returns a reference to an array of strings, one per language reported by
# the tool: "lang" when $opt_encoding is undefined, otherwise
# "lang-$opt_encoding".  Returns undef if the temporary file cannot be
# written or the command cannot be started.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # The tool reads its input from a file, so save the text to a temp file
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tmp_out;
    if (!open($tmp_out, '>', $tmp_txt_filename)) {
        my $open_error = "$!";
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$open_error\n";
        return undef;
    }
    binmode($tmp_out, ":utf8");
    print $tmp_out $$contents_ref;
    if (!close($tmp_out)) {
        # Buffered write errors (e.g. disk full) only surface at close time
        my $close_error = "$!";
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$close_error\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the command line.  It is run through the shell as one string,
    # so the temp filename is quoted in case its directory contains spaces.
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_in;
    if (!open($ngram_in, '-|', $java_cmd)) {
        my $pipe_error = "$!";
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$pipe_error\n";
        # Do not leave the temp file behind on the failure path
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form.
    # Only lines of the shape "speed: <lang>:<score> <lang>:<score> ... .."
    # are of interest; everything else the tool prints is ignored.
    my $lang_encode_pairs = [];
    while (defined(my $line = <$ngram_in>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my @lang_array = split(/\s+/, $1);

        my @lang_summary = ( "++ Ngram language probabilities:\n++ ");

        foreach my $lang_score (@lang_array) {
            # The summary echoes every token exactly as the tool printed it
            push(@lang_summary, $lang_score);

            # Tokens that are not "lang:score" carry no language name;
            # skip them rather than returning an undefined/empty entry
            my ($lang) = ($lang_score =~ m/^(.+):.+$/);
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);
            push(@$lang_encode_pairs, $lang_pair);
        }
        push(@lang_summary, "\n");

        if ($self->{'verbosity'} >= 2) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle join(" ", @lang_summary);
        }
    }

    # Closing a pipe handle waits for the child process, so it is reaped
    # before the temp file it was reading is deleted
    close($ngram_in);

    &util::rm($tmp_txt_filename);

    return $lang_encode_pairs;
}
|
---|
122 |
|
---|
123 |
|
---|
# classify_contents_for_encoding -- classify text for one given encoding
#
# Simply forwards to classify_contents, handing $filter_by_encoding over as
# that method's optional encoding argument, and returns whatever it returns.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($contents_ref, $filename, $filter_by_encoding) = @_;

    return $self->classify_contents($contents_ref,
                                    $filename,
                                    $filter_by_encoding);
}
|
---|
129 |
|
---|
130 |
|
---|
131 |
|
---|
132 | 1;
|
---|