source: trunk/gsdl/perllib/gb.pm@ 81

Last change on this file since 81 was 73, checked in by rjmcnab, 26 years ago

Added support for UTF-8.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 5.5 KB
Line 
1#! /usr/local/bin/perl5 -w
2
3# useful methods for dealing with the GuoBiao (GB) encoding
4
5package gb;
6
7use util;
8
9# %encodings is of the form:
10#
11# encodings{encodingname-encodingname}->blocktranslation
12# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
13#
14# Any of the top translation blocks can point to an undefined
15# value. This data structure aims to allow fast translation and
16# efficient storage.
17
18# @array256 is used for initialisation, there must be
19# a better way...
20@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
21 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
22 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
23 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
24 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
26 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
27 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
28 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
29 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
30 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
31 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
32 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
33 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
34 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
35 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
36
37
38%translations = ();
39
40
41# gb2unicode uses the GB2312 translation table
42sub gb2unicode {
43 generalgb2unicode ("GB2312", @_);
44}
45
46# returns a pointer to unicode array
47sub generalgb2unicode {
48 my ($encoding, $intext) = @_;
49 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'},
50 "unicode/MAPPINGS/EASTASIA/GB",
51 "$encoding.TXT");
52 if (!&loadencoding ($encoding, $mapfile)) {
53 print STDERR "gb: ERROR - could not load encoding $encoding\n";
54 return [];
55 }
56
57 my @outtext = ();
58 my $encodename = "$encoding-unicode";
59 my $len = length($intext);
60 my ($c1, $c2);
61 my $i = 0;
62
63 while ($i < $len) {
64 if (($c1 = ord(substr($intext, $i, 1))) >= 0xa0) {
65 if ($i+1 < $len) {
66 if (($c2 = ord(substr($intext, $i+1, 1))) >= 0xa0) {
67 # found a GB character, put black square if cannot translate
68 my $c = &transchar ($encodename, ($c1-128)*256+$c2-128);
69 $c = 0x25a1 if $c == 0;
70 push (@outtext, $c);
71 $i += 2;
72
73 } else {
74 # error second character not > 127
75 print STDERR "gb: ERROR second GB character not >= 0xa0\n";
76 $i++;
77
78 }
79 } else {
80 # error
81 print STDERR "gb: ERROR missing second half of GB character\n";
82 $i++;
83 }
84
85 } else {
86 # normal ascii character
87 push (@outtext, $c1) if ($c1 < 0x80);
88 $i++;
89 }
90 }
91
92 return \@outtext;
93}
94
95
96# unicode2gb uses the GB2312 translation table
97sub unicode2gb {
98 generalunicode2gb ("GB2312", @_);
99}
100
101# returns a gb string. Note that this will not translate
102# traditional forms of characters to their simplified forms
103# Any traditional characters in the unicode will not be translated
104sub generalunicode2gb {
105 my ($encoding, $intext) = @_;
106
107 # load the encoding (if it is not already loaded)
108 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'},
109 "unicode/MAPPINGS/EASTASIA/GB",
110 "$encoding.TXT");
111 if (!&loadencoding ($encoding, $mapfile)) {
112 print STDERR "gb: ERROR - could not load encoding $encoding\n";
113 return "";
114 }
115
116 # translate the string
117 my ($num, $gbc);
118 my $outtext = "";
119 my $encodename = "unicode-$encoding";
120 foreach $num (@$intext) {
121 if ($num < 0x80) {
122 # normal ascii character
123 $outtext .= chr ($num);
124 } else {
125 # gb character
126 my $gbc = &transchar ($encodename, $num);
127 $gbc = 0x2175 if $gbc == 0;
128 $outtext .= chr (($gbc >> 8) + 0x80);
129 $outtext .= chr (($gbc & 0xff) + 0x80);
130 }
131 }
132
133 return $outtext;
134}
135
136
137# loadmapfile expects the mapfile to contain (at least) two
138# tab-seperated fields. The first field is the mapped value
139# and the second field is the unicode value.
140#
141# It returns 1 if successful, 0 if unsuccessful
142sub loadencoding {
143 my ($encoding, $mapfile) = @_;
144
145 my $to = "$encoding-unicode";
146 my $from = "unicode-$encoding";
147
148 # check to see if the encoding has already been loaded
149 if (defined $translations{$to} && defined $translations{$from}) {
150 return 1;
151 }
152
153 return 0 unless open (MAPFILE, $mapfile);
154
155 my ($line, @line);
156 $translations{$to} = [@array256];
157 $translations{$from} = [@array256];
158 while (defined ($line = <MAPFILE>)) {
159 # remove comments
160 $line =~ s/\#.*$//;
161 next unless $line =~ /\S/;
162
163 # split the line into fields and do a few
164 # simple sanity checks
165 @line = split (/\t/, $line);
166 next unless (scalar(@line) >= 2 &&
167 $line[0] =~ /^0x/ &&
168 $line[1] =~ /^0x/);
169
170 &addchartrans ($translations{$to}, hex($line[0]), hex($line[1]));
171 &addchartrans ($translations{$from}, hex($line[1]), hex($line[0]));
172 }
173
174 close (MAPFILE);
175
176 return 1;
177}
178
179sub showfilled {
180 my ($block) = @_;
181 my $count = 0;
182 foreach $el (@$block) {
183 if (ref ($el) eq "ARRAY") {
184 print "1";
185 $count++;
186 } else {
187 print "0";
188 }
189 }
190 print "\n";
191 print "total: $count\n";
192}
193
194
195# addchartrans adds one character translation to a translation block.
196# It also simplifies the translation block if possible.
197sub addchartrans {
198 my ($block, $from, $to) = @_;
199 my $i = 0;
200
201 my $high = ($from / 256) % 256;
202 my $low = $from % 256;
203
204 if (ref ($block->[$high]) ne "ARRAY") {
205 $block->[$high] = [@array256];
206 }
207 $block->[$high]->[$low] = $to;
208}
209
210
211sub transchar {
212 my ($encoding, $from) = @_;
213 my $high = ($from / 256) % 256;
214 my $low = $from % 256;
215
216 return 0 unless defined $translations{$encoding};
217
218 my $block = $translations{$encoding};
219
220 if (ref ($block->[$high]) ne "ARRAY") {
221 return 0;
222 }
223 return $block->[$high]->[$low];
224}
225
226
2271;
Note: See TracBrowser for help on using the repository browser.