source: trunk/gsdl/perllib/cjk.pm@ 1868

Last change on this file since 1868 was 1868, checked in by sjboddie, 23 years ago

Made a bunch of changes to the building code to support lots of new
languages and encodings. It's still kind of a mess but should be fixed
up over the weekend.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
RevLine 
[1868]1###########################################################################
2#
3# cjk.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# functions for dealing with CJK (and possibly other) complex character
27# encodings
28
29package cjk;
30
31use util;
32
33
34###########################################################################
35
36# GB to Unicode
37sub gb2unicode {
38 return &cjk2unicode ("GBK", 0x25a1, @_);
39}
40
41sub unicode2gb {
42 return &unicode2cjk ("GBK", 0xA1F5, @_);
43}
44
45###########################################################################
46
47# EUC encoded Japanese to unicode (doesn't currently support JIS X
48# 0212-1990 3-byte characters)
49sub eucjp2unicode {
50 my ($intext) = @_;
51
52 if (!&loadmapencoding ("JIS-unicode")) {
53 print STDERR "cjk: ERROR - could not load encoding JIS\n";
54 return [];
55 }
56
57 my @outtext = ();
58 my $encodename = "JIS-unicode";
59 my $len = length($intext);
60 my ($c1, $c2);
61 my $i = 0;
62
63 while ($i < $len) {
64 $c1 = ord(substr($intext, $i, 1));
65 if ($c1 < 0x80) {
66 # ASCII/JIS Roman character
67 $c1 = 0x00A5 if $c1 == 0x5C; # yen sign
68 $c1 = 0x203E if $c1 == 0x7E; # overline
69 push (@outtext, $c1);
70 $i ++;
71
72 } elsif ($c1 == 0x8E) {
73 # half-width katakana character
74 $c2 = ord(substr($intext, $i+1, 1));
75 if ($c2 >= 0xA1 && $c2 <= 0xDF) {
76 my $c = &transchar ($encodename, $c2);
77 # put a black square if cannot translate
78 $c = 0x25A1 if $c == 0;
79 push (@outtext, $c);
80 } else {
81 # error
82 print STDERR "cjk: ERROR Invalid EUS-JP character\n";
83 }
84 $i += 2;
85
86 } elsif ($c1 >= 0xA1 && $c1 <= 0xFE) {
87 # double byte character
88 $c2 = ord(substr($intext, $i+1, 1));
89 if ($c2 >= 0xA1 && $c2 <= 0xFE) {
90 $c1 = $c1 & 0x7F;
91 $c2 = $c2 & 0x7F;
92 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
93
94 # katakana midle dot seems to be the default character of choice
95 # for this encoding
96 $c = 0x30FB if $c == 0;
97 push (@outtext, $c);
98 } else {
99 # error
100 print STDERR "cjk: ERROR Invalid EUS-JP character\n";
101 }
102 $i += 2;
103
104 } elsif ($c1 == 0x8F) {
105 # three byte character - not supported - output a black square
106 print STDERR "cjk: WARNING EUS-JP string appears to contain 3 byte characters ";
107 print STDERR "which aren't supported\n";
108 push (@outtext, 0x25A1);
109 $i += 3;
110
111 } else {
112 # error
113 print STDERR "cjk: ERROR Invalid EUS-JP character\n";
114 $i ++;
115 }
116 }
117 return \@outtext;
118}
119
120# todo unicode2eucjp
121
122###########################################################################
123
124# UHC Korean to unicode
125sub uhc2unicode {
126 return &cjk2unicode ("UHC", 0x25a1, @_);
127}
128
129###########################################################################
130
131# Shift-JIS to unicode
132sub sjis2unicode {
133 my ($intext) = @_;
134
135 if (!&loadmapencoding ("SJIS-unicode")) {
136 print STDERR "cjk: ERROR - could not load encoding SJIS\n";
137 return [];
138 }
139
140 my @outtext = ();
141 my $encodename = "SJIS-unicode";
142 my $len = length($intext);
143 my ($c1, $c2);
144 my $i = 0;
145
146 while ($i < $len) {
147 $c1 = ord(substr($intext, $i, 1));
148
149 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
150 # Single byte half-width katakana character or
151 # JIS Roman yen or overline characters
152 my $c = &transchar ($encodename, $c1);
153 # - put a black square if cannot translate
154 $c = 0x25A1 if $c == 0;
155 push (@outtext, $c);
156 $i++;
157
158 } elsif ($c1 < 0x80) {
159 # ASCII
160 push (@outtext, $c1);
161 $i ++;
162
163 } elsif ($c1 < 0xEF) {
164 if ($i+1 < $len) {
165 $c2 = ord(substr($intext, $i+1, 1));
166 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
167 # Double byte shift-jis character
168 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
169 # put a black square if cannot translate
170 $c = 0x25A1 if $c == 0;
171 push (@outtext, $c);
172 } else {
173 # error
174 print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
175 }
176 $i += 2;
177 } else {
178 # error
179 print STDERR "cjk: ERROR missing second half of Shift-JIS character\n";
180 $i ++;
181 }
182 } else {
183 # error
184 print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
185 $i ++;
186 }
187 }
188 return \@outtext;
189}
190
191sub unicode2sjis {
192 return &unicode2cjk ("SJIS", 0x81A0, @_,);
193}
194
195###########################################################################
196
197# %translations is of the form:
198#
199# encodings{encodingname-encodingname}->blocktranslation
200# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
201#
202# Any of the top translation blocks can point to an undefined
203# value. This data structure aims to allow fast translation and
204# efficient storage.
205%translations = ();
206
207# @array256 is used for initialisation, there must be
208# a better way...
209@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
210 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
211 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
212 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
213 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
214 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
215 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
216 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
217 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
218 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
219 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
220 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
221 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
222 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
223 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
224 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
225
226# The following two functions may be used when converting between some cjk
227# encodings and unicode. They treat all characters <= 0x80 as ascii and
228# convert everything else using the mapping for $encoding.
229# $dchar is the default character that's inserted if a character can't be
230# translated.
231
232# returns a pointer to unicode array
233sub cjk2unicode {
234 my ($encoding, $dchar, $intext) = @_;
235 $dchar = 0x20 unless defined $dchar;
236
237 if (!&loadmapencoding ("$encoding-unicode")) {
238 print STDERR "cjk: ERROR - could not load encoding $encoding\n";
239 return [];
240 }
241
242 my @outtext = ();
243 my $encodename = "$encoding-unicode";
244 my $len = length($intext);
245 my ($c1, $c2);
246 my $i = 0;
247
248 while ($i < $len) {
249 if (($c1 = ord(substr($intext, $i, 1))) >= 0x80) {
250 if ($i+1 < $len) {
251 # double byte character
252 $c2 = ord(substr($intext, $i+1, 1));
253 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
254 $c = $dchar if $c == 0;
255 push (@outtext, $c);
256 $i += 2;
257
258 } else {
259 # error
260 print STDERR "cjk: ERROR missing second half of double byte character\n";
261 $i++;
262 }
263
264 } else {
265 # normal ascii character
266 push (@outtext, $c1);
267 $i++;
268 }
269 }
270 return \@outtext;
271}
272
273sub unicode2cjk {
274 my ($encoding, $dchar, $intext) = @_;
275 $dchar = 0x20 unless defined $dchar;
276
277 # load the encoding (if it is not already loaded)
278 if (!&loadmapencoding ("unicode-$encoding")) {
279 print STDERR "cjk: ERROR - could not load encoding $encoding\n";
280 return "";
281 }
282
283 # translate the string
284 my ($num, $char);
285 my $outtext = "";
286 my $encodename = "unicode-$encoding";
287 foreach $num (@$intext) {
288 if ($num < 0x80) {
289 # normal ascii character
290 $outtext .= chr ($num);
291 } else {
292 # mapped character
293 my $char = &transchar ($encodename, $num);
294 $char = $dchar if $char == 0;
295 $outtext .= chr ($char >> 8);
296 $outtext .= chr ($char & 0xff);
297 }
298 }
299 return $outtext;
300}
301
302# returns 1 if successful, 0 if unsuccessful
303sub loadmapencoding {
304 my ($encoding) = @_;
305
306 # check to see if the encoding has already been loaded
307 return 1 if (defined $translations{$encoding});
308
309 my $filename = $encoding;
310 $filename =~ s/\-//;
311 $filename =~ s/unicode/u/i;
312 $filename =~ tr/A-Z/a-z/;
313 $filename .= ".ump"; # unicode map file
314
315 return 0 unless open (MAPFILE, &util::filename_cat ($ENV{'GSDLHOME'}, "unicode", $filename));
316 binmode (MAPFILE);
317
318 $translations{$encoding} = [@array256];
319 my $block = $translations{$encoding};
320
321 my ($in,$i,$j);
322 while (read(MAPFILE, $in, 1) == 1) {
323 $i = unpack ("C", $in);
324 $block->[$i] = [@array256];
325 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
326 my ($n1, $n2) = unpack ("CC", $in);
327 $block->[$i]->[$j] = ($n1*256) + $n2;
328 }
329 }
330
331 close (MAPFILE);
332}
333
334# loadencoding expects the mapfile to contain (at least) two
335# tab-separated fields. The first field is the mapped value
336# and the second field is the unicode value.
337#
338# It returns 1 if successful, 0 if unsuccessful
339sub loadencoding {
340 my ($encoding, $mapfile) = @_;
341
342 my $to = "$encoding-unicode";
343 my $from = "unicode-$encoding";
344
345 # check to see if the encoding has already been loaded
346 if (defined $translations{$to} && defined $translations{$from}) {
347 return 1;
348 }
349
350 return 0 unless open (MAPFILE, $mapfile);
351
352 my ($line, @line);
353 $translations{$to} = [@array256];
354 $translations{$from} = [@array256];
355 while (defined ($line = <MAPFILE>)) {
356 chomp $line;
357 # remove comments
358 $line =~ s/\#.*$//;
359 next unless $line =~ /\S/;
360
361 # split the line into fields and do a few
362 # simple sanity checks
363 @line = split (/\t/, $line);
364 next unless (scalar(@line) >= 2 &&
365 $line[0] =~ /^0x/ &&
366 $line[1] =~ /^0x/);
367
368 my $char = hex($line[0]);
369 my $unic = hex($line[1]);
370
371 # might need this for some versions of gb but not gbk
372# $char = $char | 0x8080 unless ($encoding =~ /gbk/i);
373
374 &addchartrans ($translations{$to}, $char, $unic);
375 &addchartrans ($translations{$from}, $unic, $char);
376 }
377
378 close (MAPFILE);
379
380 return 1;
381}
382
383# addchartrans adds one character translation to a translation block.
384# It also simplifies the translation block if possible.
385sub addchartrans {
386 my ($block, $from, $to) = @_;
387 my $i = 0;
388
389 my $high = ($from / 256) % 256;
390 my $low = $from % 256;
391
392 if (ref ($block->[$high]) ne "ARRAY") {
393 $block->[$high] = [@array256];
394 }
395 $block->[$high]->[$low] = $to;
396}
397
398sub transchar {
399 my ($encoding, $from) = @_;
400 my $high = ($from / 256) % 256;
401 my $low = $from % 256;
402
403 return 0 unless defined $translations{$encoding};
404
405 my $block = $translations{$encoding};
406
407 if (ref ($block->[$high]) ne "ARRAY") {
408 return 0;
409 }
410 return $block->[$high]->[$low];
411}
412
413sub writemapfile {
414 my ($encoding, $filename, $tounicode) = @_;
415
416 $filename .= ".ump"; # unicode map file
417 if ($tounicode) {
418 $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename);
419 } else {
420 $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename);
421 }
422
423 die "translation not defined" if (!defined $translations{$encoding});
424 my $block = $translations{$encoding};
425
426 print "writing $filename\n";
427 open (MAPFILE, ">" . $filename) || die;
428 binmode (MAPFILE);
429
430 my ($i, $j);
431 for ($i=0; $i<256; $i++) {
432 if (ref ($block->[$i]) eq "ARRAY") {
433 print MAPFILE pack ("C", $i);
434 for ($j=0; $j<256; $j++) {
435 # unsigned short in network order
436 print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256),
437 $block->[$i]->[$j] % 256);
438 }
439 }
440 }
441 close (MAPFILE);
442}
443
444sub makeencodingmapfile {
445 my ($encoding, $txtmapfile) = @_;
446
447 if (!&loadencoding ($encoding, $txtmapfile)) {
448 die "couldn't load encoding $encoding";
449 }
450
451 # write out map files
452 &writemapfile ("$encoding-unicode", $encoding, 1);
453 &writemapfile ("unicode-$encoding", $encoding, 0);
454}
455
4561;
Note: See TracBrowser for help on using the repository browser.