source: gs2-extensions/ngramj/perllib/ngramj.pm@ 25141

Last change on this file since 25141 was 25141, checked in by papitha, 12 years ago

NGRAMJ PERL MODULE ADDED /MAORI LANGUAGE GUESSING WORKING WELL!!

File size: 3.2 KB
Line 
1###########################################################################
2#
3# ngramj.pm -- Identify the language of a piece of text
4#
5#
6# This file is based on TextCat version 1.08 by Gertjan van Noord
7# Copyright (C) 1997 Gertjan van Noord ([email protected])
8# TextCat is available from: http://odur.let.rug.nl/~vannoord/TextCat
9#
10# It was modified by Gordon Paynter ([email protected]) and turned
11# into a package for use in Greenstone digital library system. Most of
12# the modifications consist of commenting out or deleting functionality
13# I don't need.
14#
15#
16# This program is free software; you can redistribute it and/or modify
17# it under the terms of the GNU General Public License as published by
18# the Free Software Foundation; either version 2 of the License, or
19# (at your option) any later version.
20#
21# This program is distributed in the hope that it will be useful,
22# but WITHOUT ANY WARRANTY; without even the implied warranty of
23# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24# GNU General Public License for more details.
25#
26# You should have received a copy of the GNU General Public License
27# along with this program; if not, write to the Free Software
28# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29#
30###########################################################################
31
32package ngramj;
33
34use strict;
35
36
# Constructor.
#
# Works out where the NGramJ jar (cngram.jar) lives under
# $GSDLHOME/ext/ngramj/jars and stores the base command used to launch it
# in $self->{'java_cmd'}.  Arguments for a particular run are appended to
# that string by classify_contents.
#
# Returns the new, blessed ngramj object.
sub new {
    my ($class) = @_;

    my $ngram_jar = &util::filename_cat($ENV{'GSDLHOME'}, "ext", "ngramj", "jars", "cngram.jar");

    # The command string is later run through the shell, so the jar path
    # is double-quoted to keep it a single argument even when GSDLHOME
    # contains spaces.
    my $self = {
        'java_cmd' => "java -jar \"$ngram_jar\"",
    };

    return bless $self, $class;
}
51
52
# Guess the language(s) of a piece of text by running NGramJ (cngram) on it.
#
# Arguments:
#   $contents_ref - reference to the string holding the text to classify
#   $filename     - accepted for interface compatibility; not used here
#   $opt_encoding - optional; when defined it is appended to the cngram
#                   command line and to every returned language as
#                   "-<encoding>"
#
# Returns a reference to an array of strings of the form "lang" (or
# "lang-encoding" when $opt_encoding is defined), in the order they appear
# on cngram's "speed:" output line(s).  The array is empty when no such
# line was produced.  Returns undef if the temporary file could not be
# written or the pipe to java could not be opened.
sub classify_contents {
    my ($self, $contents_ref, $filename, $opt_encoding) = @_;

    # The text is handed to cngram through a temporary file.
    my $tmp_txt_filename = &util::get_tmp_filename("txt");

    my $tout;
    if (!open($tout, '>', $tmp_txt_filename)) {
        my $open_error = "$!";    # capture before another call can reset it
        print STDERR "Failed to open $tmp_txt_filename\n";
        print STDERR "$open_error\n";
        return undef;
    }
    binmode($tout, ":utf8");
    print $tout $$contents_ref;
    if (!close($tout)) {
        # Buffered write errors only surface when the handle is closed.
        my $close_error = "$!";
        print STDERR "Failed to write $tmp_txt_filename\n";
        print STDERR "$close_error\n";
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Build the command line.  It is run through the shell, so the temporary
    # filename is double-quoted to survive directories containing spaces.
    my $java_cmd = $self->{'java_cmd'};
    $java_cmd .= " -lang2 \"$tmp_txt_filename\"";
    $java_cmd .= " $opt_encoding" if (defined $opt_encoding);

    my $ngram_in;
    if (!open($ngram_in, '-|', $java_cmd)) {
        my $pipe_error = "$!";
        print STDERR "Failed to open pipe to $java_cmd\n";
        print STDERR "$pipe_error\n";
        # Do not leave the temporary file behind on this error path.
        &util::rm($tmp_txt_filename);
        return undef;
    }

    # Parse what comes back and turn it into an array of 'lang-encode' form.
    # The line of interest looks like:  speed: <lang>:<score> <lang>:<score> ..
    my $lang_encode_pairs = [];
    while (defined(my $line = <$ngram_in>)) {
        next unless ($line =~ m/^\s*speed:\s*(.*?)\s*\.\./);
        my $lang_group = $1;

        foreach my $lang_score (split(/\s+/, $lang_group)) {
            # Only the language part is kept; the score is discarded.
            my ($lang) = ($lang_score =~ m/^(.+):(.+)$/);

            # Skip tokens that are not of the form lang:score rather than
            # recording an undefined language.
            next unless (defined $lang);

            my $lang_pair = $lang;
            $lang_pair .= "-$opt_encoding" if (defined $opt_encoding);

            push(@$lang_encode_pairs, $lang_pair);
        }
    }

    # Closing the pipe waits for the java process and frees the handle.
    # A failing command is only reported when it also yielded no result,
    # so that output which was successfully parsed is still returned quietly.
    if (!close($ngram_in) && !@$lang_encode_pairs) {
        print STDERR "ngramj: no language guess obtained from: $java_cmd (exit status " . ($? >> 8) . ")\n";
    }

    &util::rm($tmp_txt_filename);

    # Return the list of guessed languages (possibly empty).
    return $lang_encode_pairs;
}
115
116
# Classify a piece of text for one particular encoding.
#
# Thin wrapper around classify_contents: the third argument
# ($filter_by_encoding) is passed on as that method's optional encoding
# argument, and whatever classify_contents returns is returned unchanged.
sub classify_contents_for_encoding {
    my $self = shift(@_);
    my ($text_ref, $source_name, $encoding) = @_;

    return $self->classify_contents($text_ref, $source_name, $encoding);
}
123
124
125
1261;
Note: See TracBrowser for help on using the repository browser.