root/gs2-extensions/ngramj/perllib/ngramj.pm @ 25141

Revision 25141, 3.2 KB (checked in by papitha, 8 years ago)

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

Line 
1###########################################################################
2#
3# ngramj.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord (vannoord@let.rug.nl)
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter (gwp@cs.waikato.ac.nz) and turned
11# into a package for use in Greenstone digital library system.  Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need. 
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
32package ngramj;
33
34use strict;
35
36
# new($class)
#
# Construct an ngramj object.  The only state it carries is 'java_cmd',
# the shell command prefix that classify_contents() later extends and runs.
# The jar location is built from $ENV{'GSDLHOME'} via util::filename_cat.
#
# Returns the new object blessed into $class.
sub new {
    my ($class) = @_;

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");

    # The command is later run through the shell, so quote the jar path;
    # otherwise a GSDLHOME containing spaces is split into several arguments.
    my $self = {
        'java_cmd' => "java -jar \"$ngram_jar\"",
    };

    return bless $self, $class;
}
51
52
# classify_contents($contents_ref, $filename, $opt_encoding)
#
# Guess the language of a piece of text by writing it to a temporary file
# and running the external command held in $self->{'java_cmd'} over it.
#
#   $contents_ref - reference to the text to classify
#   $filename     - accepted for interface compatibility; not used here
#   $opt_encoding - optional; when defined it is appended to the command
#                   line and to every result as "lang-encoding"
#
# The command's output is scanned for lines of the form
#   speed: lang:score lang:score ... ..
# and the 'lang' part of each token is collected in the order reported.
#
# Returns a reference to an array of language (or "lang-encoding") strings.
# Returns undef if the temporary file cannot be written or the pipe to the
# command cannot be opened.  The temporary file is removed on every path
# after it has been created.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # Save the text to a temporary file for the external classifier.
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tmp_fh;
    if (!open($tmp_fh, '>', $tmp_txt_filename)) {
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$!\n";
        return undef;
    }
    binmode($tmp_fh, ":utf8");

    # Buffered write errors may only surface at close, so check both.
    my $write_ok = print {$tmp_fh} $$contents_ref;
    $write_ok = close($tmp_fh) && $write_ok;
    if (!$write_ok) {
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$!\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the command line.  It is run through the shell, so the temporary
    # filename is quoted in case its directory contains spaces.
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_fh;
    if (!open($ngram_fh, '-|', $java_cmd)) {
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$!\n";
        # Do not leave the temporary file behind on failure.
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form.
    my $lang_encode_pairs = [];
    while (defined(my $line = <$ngram_fh>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my $lang_group = $1;

        foreach my $lang_score (split(/\s+/, $lang_group)) {
            my ($lang) = ($lang_score =~ m/^(.+):(.+)$/);

            # Skip tokens that are not of the form lang:score instead of
            # pushing an undefined entry into the result.
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);

            push(@$lang_encode_pairs, $lang_pair);
        }
    }

    # Closing the pipe waits for the child process to finish.
    close($ngram_fh);

    &util::rm($tmp_txt_filename);

    return $lang_encode_pairs;
}
115
116
# classify_contents_for_encoding($contents_ref, $filename, $filter_by_encoding)
#
# Thin wrapper: forwards its three arguments unchanged to the object's
# classify_contents() method, passing $filter_by_encoding as the optional
# encoding argument, and returns whatever that method returns.
sub classify_contents_for_encoding {
    my $self = shift;
    my ($text_ref, $source_name, $encoding) = @_;

    return $self->classify_contents($text_ref, $source_name, $encoding);
}
123   
124
125
1261;
Note: See TracBrowser for help on using the browser.