source: trunk/gsdl/perllib/gb.pm@ 1218

Last change on this file since 1218 was 1218, checked in by sjboddie, 24 years ago

fixed bug in gb.pm preventing gb encoding text from being translated
correctly on windows

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# gb.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26#! /usr/local/bin/perl5 -w
27
28# useful methods for dealing with the GuoBiao (GB) encoding
29
30package gb;
31
32use util;
33
34# %encodings is of the form:
35#
36# encodings{encodingname-encodingname}->blocktranslation
37# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
38#
39# Any of the top translation blocks can point to an undefined
40# value. This data structure aims to allow fast translation and
41# efficient storage.
42
43# @array256 is used for initialisation, there must be
44# a better way...
45@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
61
62
63%translations = ();
64
65
66# gb2unicode uses the GBK translation table
67sub gb2unicode {
68 generalgb2unicode ("GBK", @_);
69}
70
71# returns a pointer to unicode array
72sub generalgb2unicode {
73 my ($encoding, $intext) = @_;
74
75 if (!&loadmapencoding ("$encoding-unicode")) {
76 print STDERR "gb: ERROR - could not load encoding $encoding\n";
77 return [];
78 }
79
80 my @outtext = ();
81 my $encodename = "$encoding-unicode";
82 my $len = length($intext);
83 my ($c1, $c2);
84 my $i = 0;
85
86 while ($i < $len) {
87 if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) {
88 if ($i+1 < $len) {
89 # found a GB character, put black square if cannot translate
90 $c2 = ord(substr($intext, $i+1, 1));
91 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
92 $c = 0x25a1 if $c == 0;
93 push (@outtext, $c);
94 $i += 2;
95
96 } else {
97 # error
98 print STDERR "gb: ERROR missing second half of GB character\n";
99 $i++;
100 }
101
102 } else {
103 # normal ascii character
104 push (@outtext, $c1) if ($c1 < 0x80);
105 $i++;
106 }
107 }
108
109 return \@outtext;
110}
111
112
113# unicode2gb uses the GBK translation table
114sub unicode2gb {
115 generalunicode2gb ("GBK", @_);
116}
117
118# returns a gb string. Note that this will not translate
119# traditional forms of characters to their simplified forms
120# Any traditional characters in the unicode will not be translated
121sub generalunicode2gb {
122 my ($encoding, $intext) = @_;
123
124 # load the encoding (if it is not already loaded)
125 if (!&loadmapencoding ("unicode-$encoding")) {
126 print STDERR "gb: ERROR - could not load encoding $encoding\n";
127 return "";
128 }
129
130 # translate the string
131 my ($num, $gbc);
132 my $outtext = "";
133 my $encodename = "unicode-$encoding";
134 foreach $num (@$intext) {
135 if ($num < 0x80) {
136 # normal ascii character
137 $outtext .= chr ($num);
138 } else {
139 # gb character
140 my $gbc = &transchar ($encodename, $num);
141 $gbc = 0xa1f5 if $gbc == 0;
142 $outtext .= chr ($gbc >> 8);
143 $outtext .= chr ($gbc & 0xff);
144 }
145 }
146
147 return $outtext;
148}
149
150
151# It returns 1 if successful, 0 if unsuccessful
152sub loadmapencoding {
153 my ($encoding) = @_;
154
155 # check to see if the encoding has already been loaded
156 return 1 if (defined $translations{$encoding});
157
158 my $filename = $encoding;
159 $filename =~ s/\-//;
160 $filename =~ s/unicode/u/i;
161 $filename =~ tr/A-Z/a-z/;
162 $filename .= ".ump"; # unicode map file
163
164 return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename");
165 binmode (MAPFILE); # f$#@!!! windows
166
167 $translations{$encoding} = [@array256];
168 my $block = $translations{$encoding};
169
170 my ($in,$i,$j);
171 while (read(MAPFILE, $in, 1) == 1) {
172 $i = unpack ("C", $in);
173 $block->[$i] = [@array256];
174 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
175 my ($n1, $n2) = unpack ("CC", $in);
176 $block->[$i]->[$j] = ($n1*256) + $n2;
177 }
178 }
179
180 close (MAPFILE);
181}
182
183
184
185# loadencoding expects the mapfile to contain (at least) two
186# tab-seperated fields. The first field is the mapped value
187# and the second field is the unicode value.
188#
189# It returns 1 if successful, 0 if unsuccessful
190sub loadencoding {
191 my ($encoding, $mapfile) = @_;
192
193 my $to = "$encoding-unicode";
194 my $from = "unicode-$encoding";
195
196 # check to see if the encoding has already been loaded
197 if (defined $translations{$to} && defined $translations{$from}) {
198 return 1;
199 }
200
201 return 0 unless open (MAPFILE, $mapfile);
202
203 my ($line, @line);
204 $translations{$to} = [@array256];
205 $translations{$from} = [@array256];
206 while (defined ($line = <MAPFILE>)) {
207 # remove comments
208 $line =~ s/\#.*$//;
209 next unless $line =~ /\S/;
210
211 # split the line into fields and do a few
212 # simple sanity checks
213 @line = split (/\t/, $line);
214 next unless (scalar(@line) >= 2 &&
215 $line[0] =~ /^0x/ &&
216 $line[1] =~ /^0x/);
217
218 my $gbc = hex($line[0]);
219 my $unic = hex($line[1]);
220
221 # gbk has already had the necessary most significant
222 # bits set (it is used in the encoding
223 $gbc = $gbc | 0x8080 unless ($encoding =~ /gbk/i);
224
225 &addchartrans ($translations{$to}, $gbc, $unic);
226 &addchartrans ($translations{$from}, $unic, $gbc);
227 }
228
229 close (MAPFILE);
230
231 return 1;
232}
233
234sub showfilled {
235 my ($block) = @_;
236 my $count = 0;
237 foreach $el (@$block) {
238 if (ref ($el) eq "ARRAY") {
239 print "1";
240 $count++;
241 } else {
242 print "0";
243 }
244 }
245 print "\n";
246 print "total: $count\n";
247}
248
249
250# addchartrans adds one character translation to a translation block.
251# It also simplifies the translation block if possible.
252sub addchartrans {
253 my ($block, $from, $to) = @_;
254 my $i = 0;
255
256 my $high = ($from / 256) % 256;
257 my $low = $from % 256;
258
259 if (ref ($block->[$high]) ne "ARRAY") {
260 $block->[$high] = [@array256];
261 }
262 $block->[$high]->[$low] = $to;
263}
264
265
266sub transchar {
267 my ($encoding, $from) = @_;
268 my $high = ($from / 256) % 256;
269 my $low = $from % 256;
270
271 return 0 unless defined $translations{$encoding};
272
273 my $block = $translations{$encoding};
274
275 if (ref ($block->[$high]) ne "ARRAY") {
276 return 0;
277 }
278 return $block->[$high]->[$low];
279}
280
281
2821;
Note: See TracBrowser for help on using the repository browser.