Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

source: trunk/gsdl/perllib/cjk.pm@ 1868

Last change on this file since 1868 was 1868, checked in by sjboddie, 23 years ago
Made a bunch of changes to the building code to support lots of new languages and encodings. It's still kind of a mess but should be fixed up over the weekend.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.0 KB

Rev	Line
[1868]	1	###########################################################################
	2	#
	3	# cjk.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 2001 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# functions for dealing with CJK (and possibly other) complex character
	27	# encodings
	28
	29	package cjk;
	30
	31	use util;
	32
	33
	34	###########################################################################
	35
	36	# GB to Unicode
	37	sub gb2unicode {
	38	return &cjk2unicode ("GBK", 0x25a1, @_);
	39	}
	40
	41	sub unicode2gb {
	42	return &unicode2cjk ("GBK", 0xA1F5, @_);
	43	}
	44
	45	###########################################################################
	46
	47	# EUC encoded Japanese to unicode (doesn't currently support JIS X
	48	# 0212-1990 3-byte characters)
	49	sub eucjp2unicode {
	50	my ($intext) = @_;
	51
	52	if (!&loadmapencoding ("JIS-unicode")) {
	53	print STDERR "cjk: ERROR - could not load encoding JIS\n";
	54	return [];
	55	}
	56
	57	my @outtext = ();
	58	my $encodename = "JIS-unicode";
	59	my $len = length($intext);
	60	my ($c1, $c2);
	61	my $i = 0;
	62
	63	while ($i < $len) {
	64	$c1 = ord(substr($intext, $i, 1));
	65	if ($c1 < 0x80) {
	66	# ASCII/JIS Roman character
	67	$c1 = 0x00A5 if $c1 == 0x5C; # yen sign
	68	$c1 = 0x203E if $c1 == 0x7E; # overline
	69	push (@outtext, $c1);
	70	$i ++;
	71
	72	} elsif ($c1 == 0x8E) {
	73	# half-width katakana character
	74	$c2 = ord(substr($intext, $i+1, 1));
	75	if ($c2 >= 0xA1 && $c2 <= 0xDF) {
	76	my $c = &transchar ($encodename, $c2);
	77	# put a black square if cannot translate
	78	$c = 0x25A1 if $c == 0;
	79	push (@outtext, $c);
	80	} else {
	81	# error
	82	print STDERR "cjk: ERROR Invalid EUS-JP character\n";
	83	}
	84	$i += 2;
	85
	86	} elsif ($c1 >= 0xA1 && $c1 <= 0xFE) {
	87	# double byte character
	88	$c2 = ord(substr($intext, $i+1, 1));
	89	if ($c2 >= 0xA1 && $c2 <= 0xFE) {
	90	$c1 = $c1 & 0x7F;
	91	$c2 = $c2 & 0x7F;
	92	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	93
	94	# katakana midle dot seems to be the default character of choice
	95	# for this encoding
	96	$c = 0x30FB if $c == 0;
	97	push (@outtext, $c);
	98	} else {
	99	# error
	100	print STDERR "cjk: ERROR Invalid EUS-JP character\n";
	101	}
	102	$i += 2;
	103
	104	} elsif ($c1 == 0x8F) {
	105	# three byte character - not supported - output a black square
	106	print STDERR "cjk: WARNING EUS-JP string appears to contain 3 byte characters ";
	107	print STDERR "which aren't supported\n";
	108	push (@outtext, 0x25A1);
	109	$i += 3;
	110
	111	} else {
	112	# error
	113	print STDERR "cjk: ERROR Invalid EUS-JP character\n";
	114	$i ++;
	115	}
	116	}
	117	return \@outtext;
	118	}
	119
	120	# todo unicode2eucjp
	121
	122	###########################################################################
	123
	124	# UHC Korean to unicode
	125	sub uhc2unicode {
	126	return &cjk2unicode ("UHC", 0x25a1, @_);
	127	}
	128
	129	###########################################################################
	130
	131	# Shift-JIS to unicode
	132	sub sjis2unicode {
	133	my ($intext) = @_;
	134
	135	if (!&loadmapencoding ("SJIS-unicode")) {
	136	print STDERR "cjk: ERROR - could not load encoding SJIS\n";
	137	return [];
	138	}
	139
	140	my @outtext = ();
	141	my $encodename = "SJIS-unicode";
	142	my $len = length($intext);
	143	my ($c1, $c2);
	144	my $i = 0;
	145
	146	while ($i < $len) {
	147	$c1 = ord(substr($intext, $i, 1));
	148
	149	if (($c1 >= 0xA1 && $c1 <= 0xDF) \|\| $c1 == 0x5c \|\| $c1 == 0x7E) {
	150	# Single byte half-width katakana character or
	151	# JIS Roman yen or overline characters
	152	my $c = &transchar ($encodename, $c1);
	153	# - put a black square if cannot translate
	154	$c = 0x25A1 if $c == 0;
	155	push (@outtext, $c);
	156	$i++;
	157
	158	} elsif ($c1 < 0x80) {
	159	# ASCII
	160	push (@outtext, $c1);
	161	$i ++;
	162
	163	} elsif ($c1 < 0xEF) {
	164	if ($i+1 < $len) {
	165	$c2 = ord(substr($intext, $i+1, 1));
	166	if (($c2 >= 0x40 && $c2 <= 0x7E) \|\| ($c2 >= 0x80 && $c2 <= 0xFC)) {
	167	# Double byte shift-jis character
	168	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	169	# put a black square if cannot translate
	170	$c = 0x25A1 if $c == 0;
	171	push (@outtext, $c);
	172	} else {
	173	# error
	174	print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
	175	}
	176	$i += 2;
	177	} else {
	178	# error
	179	print STDERR "cjk: ERROR missing second half of Shift-JIS character\n";
	180	$i ++;
	181	}
	182	} else {
	183	# error
	184	print STDERR "cjk: ERROR Invalid Shift-JIS character\n";
	185	$i ++;
	186	}
	187	}
	188	return \@outtext;
	189	}
	190
	191	sub unicode2sjis {
	192	return &unicode2cjk ("SJIS", 0x81A0, @_,);
	193	}
	194
	195	###########################################################################
	196
	197	# %translations is of the form:
	198	#
	199	# encodings{encodingname-encodingname}->blocktranslation
	200	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
	201	#
	202	# Any of the top translation blocks can point to an undefined
	203	# value. This data structure aims to allow fast translation and
	204	# efficient storage.
	205	%translations = ();
	206
	207	# @array256 is used for initialisation, there must be
	208	# a better way...
	209	@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	210	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	211	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	212	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	213	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	214	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	215	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	216	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	217	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	218	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	219	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	220	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	221	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	222	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	223	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	224	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
	225
	226	# The following two functions may be used when converting between some cjk
	227	# encodings and unicode. They treat all characters <= 0x80 as ascii and
	228	# convert everything else using the mapping for $encoding.
	229	# $dchar is the default character that's inserted if a character can't be
	230	# translated.
	231
	232	# returns a pointer to unicode array
	233	sub cjk2unicode {
	234	my ($encoding, $dchar, $intext) = @_;
	235	$dchar = 0x20 unless defined $dchar;
	236
	237	if (!&loadmapencoding ("$encoding-unicode")) {
	238	print STDERR "cjk: ERROR - could not load encoding $encoding\n";
	239	return [];
	240	}
	241
	242	my @outtext = ();
	243	my $encodename = "$encoding-unicode";
	244	my $len = length($intext);
	245	my ($c1, $c2);
	246	my $i = 0;
	247
	248	while ($i < $len) {
	249	if (($c1 = ord(substr($intext, $i, 1))) >= 0x80) {
	250	if ($i+1 < $len) {
	251	# double byte character
	252	$c2 = ord(substr($intext, $i+1, 1));
	253	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
	254	$c = $dchar if $c == 0;
	255	push (@outtext, $c);
	256	$i += 2;
	257
	258	} else {
	259	# error
	260	print STDERR "cjk: ERROR missing second half of double byte character\n";
	261	$i++;
	262	}
	263
	264	} else {
	265	# normal ascii character
	266	push (@outtext, $c1);
	267	$i++;
	268	}
	269	}
	270	return \@outtext;
	271	}
	272
	273	sub unicode2cjk {
	274	my ($encoding, $dchar, $intext) = @_;
	275	$dchar = 0x20 unless defined $dchar;
	276
	277	# load the encoding (if it is not already loaded)
	278	if (!&loadmapencoding ("unicode-$encoding")) {
	279	print STDERR "cjk: ERROR - could not load encoding $encoding\n";
	280	return "";
	281	}
	282
	283	# translate the string
	284	my ($num, $char);
	285	my $outtext = "";
	286	my $encodename = "unicode-$encoding";
	287	foreach $num (@$intext) {
	288	if ($num < 0x80) {
	289	# normal ascii character
	290	$outtext .= chr ($num);
	291	} else {
	292	# mapped character
	293	my $char = &transchar ($encodename, $num);
	294	$char = $dchar if $char == 0;
	295	$outtext .= chr ($char >> 8);
	296	$outtext .= chr ($char & 0xff);
	297	}
	298	}
	299	return $outtext;
	300	}
	301
	302	# returns 1 if successful, 0 if unsuccessful
	303	sub loadmapencoding {
	304	my ($encoding) = @_;
	305
	306	# check to see if the encoding has already been loaded
	307	return 1 if (defined $translations{$encoding});
	308
	309	my $filename = $encoding;
	310	$filename =~ s/\-//;
	311	$filename =~ s/unicode/u/i;
	312	$filename =~ tr/A-Z/a-z/;
	313	$filename .= ".ump"; # unicode map file
	314
	315	return 0 unless open (MAPFILE, &util::filename_cat ($ENV{'GSDLHOME'}, "unicode", $filename));
	316	binmode (MAPFILE);
	317
	318	$translations{$encoding} = [@array256];
	319	my $block = $translations{$encoding};
	320
	321	my ($in,$i,$j);
	322	while (read(MAPFILE, $in, 1) == 1) {
	323	$i = unpack ("C", $in);
	324	$block->[$i] = [@array256];
	325	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
	326	my ($n1, $n2) = unpack ("CC", $in);
	327	$block->[$i]->[$j] = ($n1*256) + $n2;
	328	}
	329	}
	330
	331	close (MAPFILE);
	332	}
	333
	334	# loadencoding expects the mapfile to contain (at least) two
	335	# tab-separated fields. The first field is the mapped value
	336	# and the second field is the unicode value.
	337	#
	338	# It returns 1 if successful, 0 if unsuccessful
	339	sub loadencoding {
	340	my ($encoding, $mapfile) = @_;
	341
	342	my $to = "$encoding-unicode";
	343	my $from = "unicode-$encoding";
	344
	345	# check to see if the encoding has already been loaded
	346	if (defined $translations{$to} && defined $translations{$from}) {
	347	return 1;
	348	}
	349
	350	return 0 unless open (MAPFILE, $mapfile);
	351
	352	my ($line, @line);
	353	$translations{$to} = [@array256];
	354	$translations{$from} = [@array256];
	355	while (defined ($line = <MAPFILE>)) {
	356	chomp $line;
	357	# remove comments
	358	$line =~ s/\#.*$//;
	359	next unless $line =~ /\S/;
	360
	361	# split the line into fields and do a few
	362	# simple sanity checks
	363	@line = split (/\t/, $line);
	364	next unless (scalar(@line) >= 2 &&
	365	$line[0] =~ /^0x/ &&
	366	$line[1] =~ /^0x/);
	367
	368	my $char = hex($line[0]);
	369	my $unic = hex($line[1]);
	370
	371	# might need this for some versions of gb but not gbk
	372	# $char = $char \| 0x8080 unless ($encoding =~ /gbk/i);
	373
	374	&addchartrans ($translations{$to}, $char, $unic);
	375	&addchartrans ($translations{$from}, $unic, $char);
	376	}
	377
	378	close (MAPFILE);
	379
	380	return 1;
	381	}
	382
	383	# addchartrans adds one character translation to a translation block.
	384	# It also simplifies the translation block if possible.
	385	sub addchartrans {
	386	my ($block, $from, $to) = @_;
	387	my $i = 0;
	388
	389	my $high = ($from / 256) % 256;
	390	my $low = $from % 256;
	391
	392	if (ref ($block->[$high]) ne "ARRAY") {
	393	$block->[$high] = [@array256];
	394	}
	395	$block->[$high]->[$low] = $to;
	396	}
	397
	398	sub transchar {
	399	my ($encoding, $from) = @_;
	400	my $high = ($from / 256) % 256;
	401	my $low = $from % 256;
	402
	403	return 0 unless defined $translations{$encoding};
	404
	405	my $block = $translations{$encoding};
	406
	407	if (ref ($block->[$high]) ne "ARRAY") {
	408	return 0;
	409	}
	410	return $block->[$high]->[$low];
	411	}
	412
	413	sub writemapfile {
	414	my ($encoding, $filename, $tounicode) = @_;
	415
	416	$filename .= ".ump"; # unicode map file
	417	if ($tounicode) {
	418	$filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "to_uc", $filename);
	419	} else {
	420	$filename = &util::filename_cat ($ENV{'GSDLHOME'}, "mappings", "from_uc", $filename);
	421	}
	422
	423	die "translation not defined" if (!defined $translations{$encoding});
	424	my $block = $translations{$encoding};
	425
	426	print "writing $filename\n";
	427	open (MAPFILE, ">" . $filename) \|\| die;
	428	binmode (MAPFILE);
	429
	430	my ($i, $j);
	431	for ($i=0; $i<256; $i++) {
	432	if (ref ($block->[$i]) eq "ARRAY") {
	433	print MAPFILE pack ("C", $i);
	434	for ($j=0; $j<256; $j++) {
	435	# unsigned short in network order
	436	print MAPFILE pack ("CC", int($block->[$i]->[$j] / 256),
	437	$block->[$i]->[$j] % 256);
	438	}
	439	}
	440	}
	441	close (MAPFILE);
	442	}
	443
	444	sub makeencodingmapfile {
	445	my ($encoding, $txtmapfile) = @_;
	446
	447	if (!&loadencoding ($encoding, $txtmapfile)) {
	448	die "couldn't load encoding $encoding";
	449	}
	450
	451	# write out map files
	452	&writemapfile ("$encoding-unicode", $encoding, 1);
	453	&writemapfile ("unicode-$encoding", $encoding, 0);
	454	}
	455
	456	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: