source: trunk/gsdl/perllib/gb.pm@ 900

Last change on this file since 900 was 537, checked in by sjboddie, 25 years ago

added GPL headers

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 7.1 KB
Line 
1###########################################################################
2#
3# gb.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26#! /usr/local/bin/perl5 -w
27
28# useful methods for dealing with the GuoBiao (GB) encoding
29
30package gb;
31
32use util;
33
34# %encodings is of the form:
35#
36# encodings{encodingname-encodingname}->blocktranslation
37# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
38#
39# Any of the top translation blocks can point to an undefined
40# value. This data structure aims to allow fast translation and
41# efficient storage.
42
43# @array256 is used for initialisation, there must be
44# a better way...
45@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
47 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
61
62
63%translations = ();
64
65
66# gb2unicode uses the GBK translation table
67sub gb2unicode {
68 generalgb2unicode ("GBK", @_);
69}
70
71# returns a pointer to unicode array
72sub generalgb2unicode {
73 my ($encoding, $intext) = @_;
74
75 if (!&loadmapencoding ("$encoding-unicode")) {
76 print STDERR "gb: ERROR - could not load encoding $encoding\n";
77 return [];
78 }
79
80 my @outtext = ();
81 my $encodename = "$encoding-unicode";
82 my $len = length($intext);
83 my ($c1, $c2);
84 my $i = 0;
85
86 while ($i < $len) {
87 if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) {
88 if ($i+1 < $len) {
89 # found a GB character, put black square if cannot translate
90 $c2 = ord(substr($intext, $i+1, 1));
91 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
92 $c = 0x25a1 if $c == 0;
93 push (@outtext, $c);
94 $i += 2;
95
96 } else {
97 # error
98 print STDERR "gb: ERROR missing second half of GB character\n";
99 $i++;
100 }
101
102 } else {
103 # normal ascii character
104 push (@outtext, $c1) if ($c1 < 0x80);
105 $i++;
106 }
107 }
108
109 return \@outtext;
110}
111
112
113# unicode2gb uses the GBK translation table
114sub unicode2gb {
115 generalunicode2gb ("GBK", @_);
116}
117
118# returns a gb string. Note that this will not translate
119# traditional forms of characters to their simplified forms
120# Any traditional characters in the unicode will not be translated
121sub generalunicode2gb {
122 my ($encoding, $intext) = @_;
123
124 # load the encoding (if it is not already loaded)
125 if (!&loadmapencoding ("unicode-$encoding")) {
126 print STDERR "gb: ERROR - could not load encoding $encoding\n";
127 return "";
128 }
129
130 # translate the string
131 my ($num, $gbc);
132 my $outtext = "";
133 my $encodename = "unicode-$encoding";
134 foreach $num (@$intext) {
135 if ($num < 0x80) {
136 # normal ascii character
137 $outtext .= chr ($num);
138 } else {
139 # gb character
140 my $gbc = &transchar ($encodename, $num);
141 $gbc = 0xa1f5 if $gbc == 0;
142 $outtext .= chr ($gbc >> 8);
143 $outtext .= chr ($gbc & 0xff);
144 }
145 }
146
147 return $outtext;
148}
149
150
151# It returns 1 if successful, 0 if unsuccessful
152sub loadmapencoding {
153 my ($encoding) = @_;
154
155 # check to see if the encoding has already been loaded
156 return 1 if (defined $translations{$encoding});
157
158 my $filename = $encoding;
159 $filename =~ s/\-//;
160 $filename =~ s/unicode/u/i;
161 $filename =~ tr/A-Z/a-z/;
162 $filename .= ".ump"; # unicode map file
163
164 return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename");
165
166 $translations{$encoding} = [@array256];
167 my $block = $translations{$encoding};
168
169 my ($in,$i,$j);
170 while (read(MAPFILE, $in, 1) == 1) {
171 $i = unpack ("C", $in);
172 $block->[$i] = [@array256];
173 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
174 my ($n1, $n2) = unpack ("CC", $in);
175 $block->[$i]->[$j] = ($n1*256) + $n2;
176 }
177 }
178
179 close (MAPFILE);
180}
181
182
183
184# loadencoding expects the mapfile to contain (at least) two
185# tab-seperated fields. The first field is the mapped value
186# and the second field is the unicode value.
187#
188# It returns 1 if successful, 0 if unsuccessful
189sub loadencoding {
190 my ($encoding, $mapfile) = @_;
191
192 my $to = "$encoding-unicode";
193 my $from = "unicode-$encoding";
194
195 # check to see if the encoding has already been loaded
196 if (defined $translations{$to} && defined $translations{$from}) {
197 return 1;
198 }
199
200 return 0 unless open (MAPFILE, $mapfile);
201
202 my ($line, @line);
203 $translations{$to} = [@array256];
204 $translations{$from} = [@array256];
205 while (defined ($line = <MAPFILE>)) {
206 # remove comments
207 $line =~ s/\#.*$//;
208 next unless $line =~ /\S/;
209
210 # split the line into fields and do a few
211 # simple sanity checks
212 @line = split (/\t/, $line);
213 next unless (scalar(@line) >= 2 &&
214 $line[0] =~ /^0x/ &&
215 $line[1] =~ /^0x/);
216
217 my $gbc = hex($line[0]);
218 my $unic = hex($line[1]);
219
220 # gbk has already had the necessary most significant
221 # bits set (it is used in the encoding
222 $gbc = $gbc | 0x8080 unless ($encoding =~ /gbk/i);
223
224 &addchartrans ($translations{$to}, $gbc, $unic);
225 &addchartrans ($translations{$from}, $unic, $gbc);
226 }
227
228 close (MAPFILE);
229
230 return 1;
231}
232
233sub showfilled {
234 my ($block) = @_;
235 my $count = 0;
236 foreach $el (@$block) {
237 if (ref ($el) eq "ARRAY") {
238 print "1";
239 $count++;
240 } else {
241 print "0";
242 }
243 }
244 print "\n";
245 print "total: $count\n";
246}
247
248
249# addchartrans adds one character translation to a translation block.
250# It also simplifies the translation block if possible.
251sub addchartrans {
252 my ($block, $from, $to) = @_;
253 my $i = 0;
254
255 my $high = ($from / 256) % 256;
256 my $low = $from % 256;
257
258 if (ref ($block->[$high]) ne "ARRAY") {
259 $block->[$high] = [@array256];
260 }
261 $block->[$high]->[$low] = $to;
262}
263
264
265sub transchar {
266 my ($encoding, $from) = @_;
267 my $high = ($from / 256) % 256;
268 my $low = $from % 256;
269
270 return 0 unless defined $translations{$encoding};
271
272 my $block = $translations{$encoding};
273
274 if (ref ($block->[$high]) ne "ARRAY") {
275 return 0;
276 }
277 return $block->[$high]->[$low];
278}
279
280
2811;
Note: See TracBrowser for help on using the repository browser.