root/gs2-extensions/ngramj/perllib/ngramj.pm @ 25155

Revision 25155, 3.6 KB (checked in by papitha, 8 years ago)

Tidy up of code to give controlled output (verbosity>=2)

Line 
1###########################################################################
2#
3# ngramj.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
11# into a package for use in the Greenstone digital library system.  Most
12# of the modifications consist of commenting out or deleting
13# functionality I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
32package ngramj;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# Construct a new ngramj language identifier.
#
# Arguments:
#   $class     - class name to bless the new object into
#   $verbosity - verbosity level; classify_contents prints its language
#                summary when this is >= 2
#   $outhandle - filehandle that summary is printed to
#
# Returns a blessed hash with the keys 'verbosity', 'outhandle' and
# 'java_cmd' (the command prefix used to launch cngram.jar, located under
# $ENV{'GSDLHOME'}/ext/ngramj/jars).
sub new {
    my $class = shift (@_);
    my ($verbosity,$outhandle) = @_;

    my $self = { 'verbosity' => $verbosity, 'outhandle' => $outhandle };

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'},"ext","ngramj","jars","cngram.jar");

    # The command is later run through the shell, so the jar path is
    # quoted to survive installation directories that contain spaces.
    $self->{'java_cmd'} = "java -jar \"$ngram_jar\"";

    return bless $self, $class;
}
51
52
# Identify the language(s) of a piece of text by running cngram over it.
#
# The text is written to a temporary file (UTF-8 layer), which is passed
# to the command in $self->{'java_cmd'} using the -lang2 option.  Every
# output line of the form "speed: lang:score lang:score ... .." is parsed
# and each language is appended to the result.
#
# Arguments:
#   $contents_ref - reference to the string to classify
#   $filename     - accepted for interface compatibility; not used here
#   $opt_encoding - optional; when defined it is appended to the command
#                   line and to each result, giving "lang-encoding"
#
# Returns a reference to an array of language (or "lang-encoding")
# strings in the order they were reported, or undef if the temporary
# file could not be written or the pipe could not be opened.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # save contents_ref to tmp file
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tout;
    if (!open($tout, ">", $tmp_txt_filename)) {
        my $open_error = "$!";
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$open_error\n";
        return undef;
    }

    binmode($tout, ":utf8");
    my $write_ok = print {$tout} $$contents_ref;
    # buffered write errors only surface at close, so check both
    $write_ok = close($tout) && $write_ok;
    if (!$write_ok) {
        my $write_error = "$!";
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$write_error\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # run java code over it; the filename is quoted because the command
    # goes through the shell and temp directories may contain spaces
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngramin;
    if (!open($ngramin, "-|", $java_cmd)) {
        my $pipe_error = "$!";
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$pipe_error\n";
        # don't leave the temporary file behind on failure
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # parse what comes back and turn into array of 'lang-encode' form
    my $lang_encode_pairs = [];
    while (defined (my $line = <$ngramin>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);

        my @lang_array = split(/\s+/, $1);
        my @lang_summary = ( "++ Ngram language probabilities:\n++  ");

        foreach my $lang_score (@lang_array) {
            # the summary shows every token exactly as cngram reported it
            push(@lang_summary, $lang_score);

            # tokens that are not "lang:score" carry no language; skip
            # them instead of adding an undefined entry to the result
            my ($lang) = ($lang_score =~ m/^(.+):.+$/);
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);

            push(@$lang_encode_pairs, $lang_pair);
        }
        push(@lang_summary, "\n");

        if (($self->{'verbosity'} || 0) >= 2) {
            # fall back to STDERR so a missing handle cannot abort the run
            my $outhandle = $self->{'outhandle'};
            $outhandle = \*STDERR unless (defined $outhandle);
            print {$outhandle} join(" ", @lang_summary);
        }
    }

    # Closing reaps the child process.  Its exit status is deliberately
    # not treated as an error: whatever output it produced is already parsed.
    close($ngramin);

    &util::rm($tmp_txt_filename);

    # array of languages (with "-encoding" suffix when one was supplied)
    return $lang_encode_pairs;
}
122
123
# Classify text when the caller wants results tied to a given encoding.
#
# Delegates straight to classify_contents, handing $filter_by_encoding
# over as that method's optional encoding argument, and returns whatever
# classify_contents returns.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($text_ref, $source_file, $encoding) = @_;

    return $self->classify_contents($text_ref, $source_file, $encoding);
}
129   
130
131
1321;
Note: See TracBrowser for help on using the browser.