source: gs2-extensions/ngramj/perllib/ngramj.pm@ 25155

Last change on this file since 25155 was 25155, checked in by papitha, 10 years ago

Tidy up of code to give controlled output (verbosity>=2)

File size: 3.6 KB
Line 
1###########################################################################
2#
3# ngramj.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
32package ngramj;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# new -- construct an ngramj language identifier.
#
# Parameters:
#   $class      - class name to bless the object into
#   $verbosity  - stored under 'verbosity'
#   $outhandle  - stored under 'outhandle'
#
# The object also records, under 'java_cmd', the command prefix used to
# run cngram.jar, which is located below $ENV{'GSDLHOME'}/ext/ngramj/jars.
sub new {
    my ($class, $verbosity, $outhandle) = @_;

    # Path components of the cngram jar, relative to the Greenstone home.
    my @jar_path_parts = ($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");
    my $jar_file = &util::filename_cat(@jar_path_parts);

    my $self = {
        'verbosity' => $verbosity,
        'outhandle' => $outhandle,
        'java_cmd'  => "java -jar $jar_file",
    };

    return bless($self, $class);
}
51
52
# classify_contents -- identify the language(s) of a piece of text.
#
# The text referenced by $contents_ref is written (as UTF-8) to a temporary
# file, the command in $self->{'java_cmd'} is run over that file with
# "-lang2", and every output line of the form
#     speed: lang:score lang:score ... ..
# is parsed into language names.
#
# Parameters:
#   $self          - object providing 'java_cmd', 'verbosity', 'outhandle'
#   $contents_ref  - reference to the text to classify
#   $filename      - accepted for interface compatibility; not used here
#   $opt_encoding  - optional; when defined it is appended to the command
#                    line and each result becomes "lang-encoding"
#
# Returns a reference to an array of language strings in the order they
# were reported, or undef if the temporary file could not be written or
# the pipe to the classifier could not be opened.  The temporary file is
# removed on every path after it has been created.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # save contents_ref to tmp file
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tmp_out;
    if (!open($tmp_out, ">", $tmp_txt_filename)) {
        my $open_error = "$!";
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$open_error\n";
        return undef;
    }

    binmode($tmp_out, ":utf8");
    print {$tmp_out} $$contents_ref;

    # Buffered write errors only surface at close; do not classify a
    # truncated file.
    if (!close($tmp_out)) {
        my $write_error = "$!";
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$write_error\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # run java code over it
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 $tmp_txt_filename";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_in;
    if (!open($ngram_in, "-|", $java_cmd)) {
        my $pipe_error = "$!";
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$pipe_error\n";
        # the temp file already exists at this point, so clean it up
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # parse what comes back and turn into array of 'lang-encode' form
    my $lang_encode_pairs = [];

    while (defined(my $line = <$ngram_in>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);

        my @lang_array = split(/\s+/, $1);
        my @lang_summary = ("++ Ngram language probabilities:\n++ ");

        foreach my $lang_score (@lang_array) {
            push(@lang_summary, $lang_score);

            # Each entry is expected to look like "lang:score"; anything
            # else carries no language name, so it is reported in the
            # summary but left out of the result.
            my ($lang) = ($lang_score =~ m/^(.+):.+$/);
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);

            push(@$lang_encode_pairs, $lang_pair);
        }
        push(@lang_summary, "\n");

        if ($self->{'verbosity'} >= 2) {
            my $outhandle = $self->{'outhandle'};
            print $outhandle join(" ", @lang_summary);
        }
    }

    # Closing a piped handle waits for the child, so it is not left behind.
    close($ngram_in);

    &util::rm($tmp_txt_filename);

    # return the array of language(-encoding) strings found for the contents
    return $lang_encode_pairs;
}
122
123
# classify_contents_for_encoding -- classify text for one given encoding.
#
# Forwards to classify_contents, passing $filter_by_encoding as that
# method's optional encoding argument, and returns whatever it returns.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($contents_ref, $filename, $filter_by_encoding) = @_;

    my @forwarded_args = ($contents_ref, $filename, $filter_by_encoding);

    return $self->classify_contents(@forwarded_args);
}
129
130
131
1321;
Note: See TracBrowser for help on using the repository browser.