########################################################################### # # gb.pm -- # A component of the Greenstone digital library software # from the New Zealand Digital Library Project at the # University of Waikato, New Zealand. # # Copyright (C) 1999 New Zealand Digital Library Project # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # ########################################################################### #! /usr/local/bin/perl5 -w # useful methods for dealing with the GuoBiao (GB) encoding package gb; use util; # %encodings is of the form: # # encodings{encodingname-encodingname}->blocktranslation # blocktranslation->[[0-255],[256-511], ..., [65280-65535]] # # Any of the top translation blocks can point to an undefined # value. This data structure aims to allow fast translation and # efficient storage. # @array256 is used for initialisation, there must be # a better way... @array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0); %translations = (); # gb2unicode uses the GBK translation table sub gb2unicode { generalgb2unicode ("GBK", @_); } # returns a pointer to unicode array sub generalgb2unicode { my ($encoding, $intext) = @_; if (!&loadmapencoding ("$encoding-unicode")) { print STDERR "gb: ERROR - could not load encoding $encoding\n"; return []; } my @outtext = (); my $encodename = "$encoding-unicode"; my $len = length($intext); my ($c1, $c2); my $i = 0; while ($i < $len) { if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) { if ($i+1 < $len) { # found a GB character, put black square if cannot translate $c2 = ord(substr($intext, $i+1, 1)); my $c = &transchar ($encodename, ($c1 << 8) | $c2); $c = 0x25a1 if $c == 0; push (@outtext, $c); $i += 2; } else { # error print STDERR "gb: ERROR missing second half of GB character\n"; $i++; } } else { # normal ascii character push (@outtext, $c1) if ($c1 < 0x80); $i++; } } return \@outtext; } # unicode2gb uses the GBK translation table sub unicode2gb { generalunicode2gb ("GBK", @_); } # returns a gb string. Note that this will not translate # traditional forms of characters to their simplified forms # Any traditional characters in the unicode will not be translated sub generalunicode2gb { my ($encoding, $intext) = @_; # load the encoding (if it is not already loaded) if (!&loadmapencoding ("unicode-$encoding")) { print STDERR "gb: ERROR - could not load encoding $encoding\n"; return ""; } # translate the string my ($num, $gbc); my $outtext = ""; my $encodename = "unicode-$encoding"; foreach $num (@$intext) { if ($num < 0x80) { # normal ascii character $outtext .= chr ($num); } else { # gb character my $gbc = &transchar ($encodename, $num); $gbc = 0xa1f5 if $gbc == 0; $outtext .= chr ($gbc >> 8); $outtext .= chr ($gbc & 0xff); } } return $outtext; } # It returns 1 if successful, 0 if unsuccessful sub loadmapencoding { my ($encoding) = @_; # check to see if the encoding has already been loaded return 1 if (defined $translations{$encoding}); my $filename = $encoding; $filename =~ s/\-//; $filename =~ s/unicode/u/i; $filename =~ tr/A-Z/a-z/; $filename .= ".ump"; # unicode map file return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename"); binmode (MAPFILE); # f$#@!!! windows $translations{$encoding} = [@array256]; my $block = $translations{$encoding}; my ($in,$i,$j); while (read(MAPFILE, $in, 1) == 1) { $i = unpack ("C", $in); $block->[$i] = [@array256]; for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) { my ($n1, $n2) = unpack ("CC", $in); $block->[$i]->[$j] = ($n1*256) + $n2; } } close (MAPFILE); } # loadencoding expects the mapfile to contain (at least) two # tab-seperated fields. The first field is the mapped value # and the second field is the unicode value. # # It returns 1 if successful, 0 if unsuccessful sub loadencoding { my ($encoding, $mapfile) = @_; my $to = "$encoding-unicode"; my $from = "unicode-$encoding"; # check to see if the encoding has already been loaded if (defined $translations{$to} && defined $translations{$from}) { return 1; } return 0 unless open (MAPFILE, $mapfile); my ($line, @line); $translations{$to} = [@array256]; $translations{$from} = [@array256]; while (defined ($line = )) { # remove comments $line =~ s/\#.*$//; next unless $line =~ /\S/; # split the line into fields and do a few # simple sanity checks @line = split (/\t/, $line); next unless (scalar(@line) >= 2 && $line[0] =~ /^0x/ && $line[1] =~ /^0x/); my $gbc = hex($line[0]); my $unic = hex($line[1]); # gbk has already had the necessary most significant # bits set (it is used in the encoding $gbc = $gbc | 0x8080 unless ($encoding =~ /gbk/i); &addchartrans ($translations{$to}, $gbc, $unic); &addchartrans ($translations{$from}, $unic, $gbc); } close (MAPFILE); return 1; } sub showfilled { my ($block) = @_; my $count = 0; foreach $el (@$block) { if (ref ($el) eq "ARRAY") { print "1"; $count++; } else { print "0"; } } print "\n"; print "total: $count\n"; } # addchartrans adds one character translation to a translation block. # It also simplifies the translation block if possible. sub addchartrans { my ($block, $from, $to) = @_; my $i = 0; my $high = ($from / 256) % 256; my $low = $from % 256; if (ref ($block->[$high]) ne "ARRAY") { $block->[$high] = [@array256]; } $block->[$high]->[$low] = $to; } sub transchar { my ($encoding, $from) = @_; my $high = ($from / 256) % 256; my $low = $from % 256; return 0 unless defined $translations{$encoding}; my $block = $translations{$encoding}; if (ref ($block->[$high]) ne "ARRAY") { return 0; } return $block->[$high]->[$low]; } 1;