source: trunk/gsdl/perllib/gb.pm@ 246

Last change on this file since 246 was 92, checked in by rjmcnab, 25 years ago

Enabled the user to specify the output conversion class. Changed how
the conversion between gb and unicode is handled by the perl code, it
now uses 'map' files residing in the unicode directory

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.0 KB
Line 
1#! /usr/local/bin/perl5 -w
2
3# useful methods for dealing with the GuoBiao (GB) encoding
4
5package gb;
6
7use util;
8
9# %encodings is of the form:
10#
11# encodings{encodingname-encodingname}->blocktranslation
12# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
13#
14# Any of the top translation blocks can point to an undefined
15# value. This data structure aims to allow fast translation and
16# efficient storage.
17
18# @array256 is used for initialisation, there must be
19# a better way...
20@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
24 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
27 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
28 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
29 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
30 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
31 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
32 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
33 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
34 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
36
37
38%translations = ();
39
40
41# gb2unicode uses the GBK translation table
42sub gb2unicode {
43 generalgb2unicode ("GBK", @_);
44}
45
46# returns a pointer to unicode array
47sub generalgb2unicode {
48 my ($encoding, $intext) = @_;
49
50 if (!&loadmapencoding ("$encoding-unicode")) {
51 print STDERR "gb: ERROR - could not load encoding $encoding\n";
52 return [];
53 }
54
55 my @outtext = ();
56 my $encodename = "$encoding-unicode";
57 my $len = length($intext);
58 my ($c1, $c2);
59 my $i = 0;
60
61 while ($i < $len) {
62 if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) {
63 if ($i+1 < $len) {
64 # found a GB character, put black square if cannot translate
65 $c2 = ord(substr($intext, $i+1, 1));
66 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
67 $c = 0x25a1 if $c == 0;
68 push (@outtext, $c);
69 $i += 2;
70
71 } else {
72 # error
73 print STDERR "gb: ERROR missing second half of GB character\n";
74 $i++;
75 }
76
77 } else {
78 # normal ascii character
79 push (@outtext, $c1) if ($c1 < 0x80);
80 $i++;
81 }
82 }
83
84 return \@outtext;
85}
86
87
88# unicode2gb uses the GBK translation table
89sub unicode2gb {
90 generalunicode2gb ("GBK", @_);
91}
92
93# returns a gb string. Note that this will not translate
94# traditional forms of characters to their simplified forms
95# Any traditional characters in the unicode will not be translated
96sub generalunicode2gb {
97 my ($encoding, $intext) = @_;
98
99 # load the encoding (if it is not already loaded)
100 if (!&loadmapencoding ("unicode-$encoding")) {
101 print STDERR "gb: ERROR - could not load encoding $encoding\n";
102 return "";
103 }
104
105 # translate the string
106 my ($num, $gbc);
107 my $outtext = "";
108 my $encodename = "unicode-$encoding";
109 foreach $num (@$intext) {
110 if ($num < 0x80) {
111 # normal ascii character
112 $outtext .= chr ($num);
113 } else {
114 # gb character
115 my $gbc = &transchar ($encodename, $num);
116 $gbc = 0xa1f5 if $gbc == 0;
117 $outtext .= chr ($gbc >> 8);
118 $outtext .= chr ($gbc & 0xff);
119 }
120 }
121
122 return $outtext;
123}
124
125
126# It returns 1 if successful, 0 if unsuccessful
127sub loadmapencoding {
128 my ($encoding) = @_;
129
130 # check to see if the encoding has already been loaded
131 return 1 if (defined $translations{$encoding});
132
133 my $filename = $encoding;
134 $filename =~ s/\-//;
135 $filename =~ s/unicode/u/i;
136 $filename =~ tr/A-Z/a-z/;
137 $filename .= ".ump"; # unicode map file
138
139 return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename");
140
141 $translations{$encoding} = [@array256];
142 my $block = $translations{$encoding};
143
144 my ($in,$i,$j);
145 while (read(MAPFILE, $in, 1) == 1) {
146 $i = unpack ("C", $in);
147 $block->[$i] = [@array256];
148 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
149 my ($n1, $n2) = unpack ("CC", $in);
150 $block->[$i]->[$j] = ($n1*256) + $n2;
151 }
152 }
153
154 close (MAPFILE);
155}
156
157
158
159# loadencoding expects the mapfile to contain (at least) two
160# tab-seperated fields. The first field is the mapped value
161# and the second field is the unicode value.
162#
163# It returns 1 if successful, 0 if unsuccessful
164sub loadencoding {
165 my ($encoding, $mapfile) = @_;
166
167 my $to = "$encoding-unicode";
168 my $from = "unicode-$encoding";
169
170 # check to see if the encoding has already been loaded
171 if (defined $translations{$to} && defined $translations{$from}) {
172 return 1;
173 }
174
175 return 0 unless open (MAPFILE, $mapfile);
176
177 my ($line, @line);
178 $translations{$to} = [@array256];
179 $translations{$from} = [@array256];
180 while (defined ($line = <MAPFILE>)) {
181 # remove comments
182 $line =~ s/\#.*$//;
183 next unless $line =~ /\S/;
184
185 # split the line into fields and do a few
186 # simple sanity checks
187 @line = split (/\t/, $line);
188 next unless (scalar(@line) >= 2 &&
189 $line[0] =~ /^0x/ &&
190 $line[1] =~ /^0x/);
191
192 my $gbc = hex($line[0]);
193 my $unic = hex($line[1]);
194
195 # gbk has already had the necessary most significant
196 # bits set (it is used in the encoding
197 $gbc = $gbc | 0x8080 unless ($encoding =~ /gbk/i);
198
199 &addchartrans ($translations{$to}, $gbc, $unic);
200 &addchartrans ($translations{$from}, $unic, $gbc);
201 }
202
203 close (MAPFILE);
204
205 return 1;
206}
207
208sub showfilled {
209 my ($block) = @_;
210 my $count = 0;
211 foreach $el (@$block) {
212 if (ref ($el) eq "ARRAY") {
213 print "1";
214 $count++;
215 } else {
216 print "0";
217 }
218 }
219 print "\n";
220 print "total: $count\n";
221}
222
223
224# addchartrans adds one character translation to a translation block.
225# It also simplifies the translation block if possible.
226sub addchartrans {
227 my ($block, $from, $to) = @_;
228 my $i = 0;
229
230 my $high = ($from / 256) % 256;
231 my $low = $from % 256;
232
233 if (ref ($block->[$high]) ne "ARRAY") {
234 $block->[$high] = [@array256];
235 }
236 $block->[$high]->[$low] = $to;
237}
238
239
240sub transchar {
241 my ($encoding, $from) = @_;
242 my $high = ($from / 256) % 256;
243 my $low = $from % 256;
244
245 return 0 unless defined $translations{$encoding};
246
247 my $block = $translations{$encoding};
248
249 if (ref ($block->[$high]) ne "ARRAY") {
250 return 0;
251 }
252 return $block->[$high]->[$low];
253}
254
255
2561;
Note: See TracBrowser for help on using the repository browser.