########################################################################### # # cjk.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 2001 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### # functions for dealing with CJK (and possibly other) complex character # encodings package cjk; use util; ########################################################################### # GB to Unicode sub gb2unicode { return &cjk2unicode ("GBK", 0x25a1, @_); } sub unicode2gb { return &unicode2cjk ("GBK", 0xA1F5, @_); } ########################################################################### # EUC encoded Japanese to unicode (doesn't currently support JIS X # 0212-1990 3-byte characters) sub eucjp2unicode { my ($intext) = @_; if (!&loadmapencoding ("JIS-unicode")) { print STDERR "cjk: ERROR - could not load encoding JIS\n"; return []; } my @outtext = (); my $encodename = "JIS-unicode"; my $len = length($intext); my ($c1, $c2); my $i = 0; while ($i < $len) { $c1 = ord(substr($intext, $i, 1)); if ($c1 < 0x80) { # ASCII/JIS Roman character $c1 = 0x00A5 if $c1 == 0x5C; # yen sign $c1 = 0x203E if $c1 == 0x7E; # overline push (@outtext, $c1); $i ++; } elsif ($c1 == 0x8E) { # half-width katakana character $c2 = ord(substr($intext, $i+1, 1)); if ($c2 >= 0xA1 && $c2 <= 0xDF) { my $c = &transchar ($encodename, $c2); # put a black square if cannot translate $c = 0x25A1 if $c == 0; push (@outtext, $c); } else { # error print STDERR "cjk: ERROR Invalid EUS-JP character\n"; } $i += 2; } elsif ($c1 >= 0xA1 && $c1 <= 0xFE) { # double byte character $c2 = ord(substr($intext, $i+1, 1)); if ($c2 >= 0xA1 && $c2 <= 0xFE) { $c1 = $c1 & 0x7F; $c2 = $c2 & 0x7F; my $c = &transchar ($encodename, ($c1 << 8) | $c2); # katakana midle dot seems to be the default character of choice # for this encoding $c = 0x30FB if $c == 0; push (@outtext, $c); } else { # error print STDERR "cjk: ERROR Invalid EUS-JP character\n"; } $i += 2; } elsif ($c1 == 0x8F) { # three byte character - not supported - output a black square print STDERR "cjk: WARNING EUS-JP string appears to contain 3 byte characters "; print STDERR "which aren't supported\n"; push (@outtext, 0x25A1); $i += 3; } else { # error print STDERR "cjk: ERROR Invalid EUS-JP character\n"; $i ++; } } return \@outtext; } # todo unicode2eucjp ########################################################################### # UHC Korean to unicode sub uhc2unicode { return &cjk2unicode ("UHC", 0x25a1, @_); } ########################################################################### # Shift-JIS to unicode sub sjis2unicode { my ($intext) = @_; if (!&loadmapencoding ("SJIS-unicode")) { print STDERR "cjk: ERROR - could not load encoding SJIS\n"; return []; } my @outtext = (); my $encodename = "SJIS-unicode"; my $len = length($intext); my ($c1, $c2); my $i = 0; while ($i < $len) { $c1 = ord(substr($intext, $i, 1)); if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) { # Single byte half-width katakana character or # JIS Roman yen or overline characters my $c = &transchar ($encodename, $c1); # - put a black square if cannot translate $c = 0x25A1 if $c == 0; push (@outtext, $c); $i++; } elsif ($c1 < 0x80) { # ASCII push (@outtext, $c1); $i ++; } elsif ($c1 < 0xEF) { if ($i+1 < $len) { $c2 = ord(substr($intext, $i+1, 1)); if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) { # Double byte shift-jis character my $c = &transchar ($encodename, ($c1 << 8) | $c2); # put a black square if cannot translate $c = 0x25A1 if $c == 0; push (@outtext, $c); } else { # error print STDERR "cjk: ERROR Invalid Shift-JIS character\n"; } $i += 2; } else { # error print STDERR "cjk: ERROR missing second half of Shift-JIS character\n"; $i ++; } } else { # error print STDERR "cjk: ERROR Invalid Shift-JIS character\n"; $i ++; } } return \@outtext; } sub unicode2sjis { return &unicode2cjk ("SJIS", 0x81A0, @_,); } ########################################################################### # %translations is of the form: # # encodings{encodingname-encodingname}->blocktranslation # blocktranslation->[[0-255],[256-511], ..., [65280-65535]] # # Any of the top translation blocks can point to an undefined # value. This data structure aims to allow fast translation and # efficient storage. %translations = (); # @array256 is used for initialisation, there must be # a better way... @array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); # The following two functions may be used when converting between some cjk # encodings and unicode. They treat all characters <= 0x80 as ascii and # convert everything else using the mapping for $encoding. # $dchar is the default character that's inserted if a character can't be # translated. # returns a pointer to unicode array sub cjk2unicode { my ($encoding, $dchar, $intext) = @_; $dchar = 0x20 unless defined $dchar; if (!&loadmapencoding ("$encoding-unicode")) { print STDERR "cjk: ERROR - could not load encoding $encoding\n"; return []; } my @outtext = (); my $encodename = "$encoding-unicode"; my $len = length($intext); my ($c1, $c2); my $i = 0; while ($i < $len) { if (($c1 = ord(substr($intext, $i, 1))) >= 0x80) { if ($i+1 < $len) { # double byte character $c2 = ord(substr($intext, $i+1, 1)); my $c = &transchar ($encodename, ($c1 << 8) | $c2); $c = $dchar if $c == 0; push (@outtext, $c); $i += 2; } else { # error print STDERR "cjk: ERROR missing second half of double byte character\n"; $i++; } } else { # normal ascii character push (@outtext, $c1); $i++; } } return \@outtext; } sub unicode2cjk { my ($encoding, $dchar, $intext) = @_; $dchar = 0x20 unless defined $dchar; # load the encoding (if it is not already loaded) if (!&loadmapencoding ("unicode-$encoding")) { print STDERR "cjk: ERROR - could not load encoding $encoding\n"; return ""; } # translate the string my ($num, $char); my $outtext = ""; my $encodename = "unicode-$encoding"; foreach $num (@$intext) { if ($num < 0x80) { # normal ascii character $outtext .= chr ($num); } else { # mapped character my $char = &transchar ($encodename, $num); $char = $dchar if $char == 0; $outtext .= chr ($char >> 8); $outtext .= chr ($char & 0xff); } } return $outtext; } # returns 1 if successful, 0 if unsuccessful sub loadmapencoding { my ($encoding) = @_; # check to see if the encoding has already been loaded return 1 if (defined $translations{$encoding}); my $filename = $encoding; $filename =~ s/\-//; $filename =~ s/unicode/u/i; $filename =~ tr/A-Z/a-z/; $filename .= ".ump"; # unicode map file return 0 unless open (MAPFILE, &util::filename_cat ($ENV{'GSDLHOME'}, "unicode", $filename)); binmode (MAPFILE); $translations{$encoding} = [@array256]; my $block = $translations{$encoding}; my ($in,$i,$j); while (read(MAPFILE, $in, 1) == 1) { $i = unpack ("C", $in); $block->[$i] = [@array256]; for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) { my ($n1, $n2) = unpack ("CC", $in); $block->[$i]->[$j] = ($n1*256) + $n2; } } close (MAPFILE); } # loadencoding expects the mapfile to contain (at least) two # tab-separated fields. The first field is the mapped value # and the second field is the unicode value. # # It returns 1 if successful, 0 if unsuccessful sub loadencoding { my ($encoding, $mapfile) = @_; my $to = "$encoding-unicode"; my $from = "unicode-$encoding"; # check to see if the encoding has already been loaded if (defined $translations{$to} && defined $translations{$from}) { return 1; } return 0 unless open (MAPFILE, $mapfile); my ($line, @line); $translations{$to} = [@array256]; $translations{$from} = [@array256]; while (defined ($line = )) { chomp $line; # remove comments $line =~ s/\#.*$//; next unless $line =~ /\S/; # split the line into fields and do a few # simple sanity checks @line = split (/\t/, $line); next unless (scalar(@line) >= 2 && $line[0] =~ /^0x/ && $line[1] =~ /^0x/); my $char = hex($line[0]); my $unic = hex($line[1]); # might need this for some versions of gb but not gbk # $char = $char | 0x8080 unless ($encoding =~ /gbk/i); &addchartrans ($translations{$to}, $char, $unic); &addchartrans ($translations{$from}, $unic, $char); } close (MAPFILE); return 1; } # addchartrans adds one character translation to a translation block. # It also simplifies the translation block if possible. sub addchartrans { my ($block, $from, $to) = @_; my $i = 0; my $high = ($from / 256) % 256; my $low = $from % 256; if (ref ($block->[$high]) ne "ARRAY") { $block->[$high] = [@array256]; } $block->[$high]->[$low] = $to; } sub transchar { my ($encoding, $from) = @_; my $high = ($from / 256) % 256; my $low = $from % 256; return 0 unless defined $translations{$encoding}; my $block = $translations{$encoding}; if (ref ($block->[$high]) ne "ARRAY") { return 0; } return $block->[$high]->[$low]; } sub writemapfile { my ($encoding, $filename, $tounicode) = @_; $filename .= ".ump"; # unicode map file if ($tounicode) { $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename); } else { $filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename); } die "translation not defined" if (!defined $translations{$encoding}); my $block = $translations{$encoding}; print "writing $filename\n"; open (MAPFILE, ">" . $filename) || die; binmode (MAPFILE); my ($i, $j); for ($i=0; $i<256; $i++) { if (ref ($block->[$i]) eq "ARRAY") { print MAPFILE pack ("C", $i); for ($j=0; $j<256; $j++) { # unsigned short in network order print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256), $block->[$i]->[$j] % 256); } } } close (MAPFILE); } sub makeencodingmapfile { my ($encoding, $txtmapfile) = @_; if (!&loadencoding ($encoding, $txtmapfile)) { die "couldn't load encoding $encoding"; } # write out map files &writemapfile ("$encoding-unicode", $encoding, 1); &writemapfile ("unicode-$encoding", $encoding, 0); } 1;