Context Navigation

source: main/tags/2.13/gsdl/perllib/gb.pm@ 24526

Last change on this file since 24526 was 537, checked in by sjboddie, 25 years ago
added GPL headers
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Line
1	###########################################################################
2	#
3	# gb.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	#! /usr/local/bin/perl5 -w
27
28	# useful methods for dealing with the GuoBiao (GB) encoding
29
30	package gb;
31
32	use util;
33
34	# %encodings is of the form:
35	#
36	# encodings{encodingname-encodingname}->blocktranslation
37	# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
38	#
39	# Any of the top translation blocks can point to an undefined
40	# value. This data structure aims to allow fast translation and
41	# efficient storage.
42
43	# @array256 is used for initialisation, there must be
44	# a better way...
45	@array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
46	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
47	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
48	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
49	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
50	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
51	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
52	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
53	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
54	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
55	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
56	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
57	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
58	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
59	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
60	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
61
62
63	%translations = ();
64
65
66	# gb2unicode uses the GBK translation table
67	sub gb2unicode {
68	generalgb2unicode ("GBK", @_);
69	}
70
71	# returns a pointer to unicode array
72	sub generalgb2unicode {
73	my ($encoding, $intext) = @_;
74
75	if (!&loadmapencoding ("$encoding-unicode")) {
76	print STDERR "gb: ERROR - could not load encoding $encoding\n";
77	return [];
78	}
79
80	my @outtext = ();
81	my $encodename = "$encoding-unicode";
82	my $len = length($intext);
83	my ($c1, $c2);
84	my $i = 0;
85
86	while ($i < $len) {
87	if (($c1 = ord(substr($intext, $i, 1))) >= 0x81) {
88	if ($i+1 < $len) {
89	# found a GB character, put black square if cannot translate
90	$c2 = ord(substr($intext, $i+1, 1));
91	my $c = &transchar ($encodename, ($c1 << 8) \| $c2);
92	$c = 0x25a1 if $c == 0;
93	push (@outtext, $c);
94	$i += 2;
95
96	} else {
97	# error
98	print STDERR "gb: ERROR missing second half of GB character\n";
99	$i++;
100	}
101
102	} else {
103	# normal ascii character
104	push (@outtext, $c1) if ($c1 < 0x80);
105	$i++;
106	}
107	}
108
109	return \@outtext;
110	}
111
112
113	# unicode2gb uses the GBK translation table
114	sub unicode2gb {
115	generalunicode2gb ("GBK", @_);
116	}
117
118	# returns a gb string. Note that this will not translate
119	# traditional forms of characters to their simplified forms
120	# Any traditional characters in the unicode will not be translated
121	sub generalunicode2gb {
122	my ($encoding, $intext) = @_;
123
124	# load the encoding (if it is not already loaded)
125	if (!&loadmapencoding ("unicode-$encoding")) {
126	print STDERR "gb: ERROR - could not load encoding $encoding\n";
127	return "";
128	}
129
130	# translate the string
131	my ($num, $gbc);
132	my $outtext = "";
133	my $encodename = "unicode-$encoding";
134	foreach $num (@$intext) {
135	if ($num < 0x80) {
136	# normal ascii character
137	$outtext .= chr ($num);
138	} else {
139	# gb character
140	my $gbc = &transchar ($encodename, $num);
141	$gbc = 0xa1f5 if $gbc == 0;
142	$outtext .= chr ($gbc >> 8);
143	$outtext .= chr ($gbc & 0xff);
144	}
145	}
146
147	return $outtext;
148	}
149
150
151	# It returns 1 if successful, 0 if unsuccessful
152	sub loadmapencoding {
153	my ($encoding) = @_;
154
155	# check to see if the encoding has already been loaded
156	return 1 if (defined $translations{$encoding});
157
158	my $filename = $encoding;
159	$filename =~ s/\-//;
160	$filename =~ s/unicode/u/i;
161	$filename =~ tr/A-Z/a-z/;
162	$filename .= ".ump"; # unicode map file
163
164	return 0 unless open (MAPFILE, "$ENV{'GSDLHOME'}/unicode/$filename");
165
166	$translations{$encoding} = [@array256];
167	my $block = $translations{$encoding};
168
169	my ($in,$i,$j);
170	while (read(MAPFILE, $in, 1) == 1) {
171	$i = unpack ("C", $in);
172	$block->[$i] = [@array256];
173	for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
174	my ($n1, $n2) = unpack ("CC", $in);
175	$block->[$i]->[$j] = ($n1*256) + $n2;
176	}
177	}
178
179	close (MAPFILE);
180	}
181
182
183
184	# loadencoding expects the mapfile to contain (at least) two
185	# tab-seperated fields. The first field is the mapped value
186	# and the second field is the unicode value.
187	#
188	# It returns 1 if successful, 0 if unsuccessful
189	sub loadencoding {
190	my ($encoding, $mapfile) = @_;
191
192	my $to = "$encoding-unicode";
193	my $from = "unicode-$encoding";
194
195	# check to see if the encoding has already been loaded
196	if (defined $translations{$to} && defined $translations{$from}) {
197	return 1;
198	}
199
200	return 0 unless open (MAPFILE, $mapfile);
201
202	my ($line, @line);
203	$translations{$to} = [@array256];
204	$translations{$from} = [@array256];
205	while (defined ($line = <MAPFILE>)) {
206	# remove comments
207	$line =~ s/\#.*$//;
208	next unless $line =~ /\S/;
209
210	# split the line into fields and do a few
211	# simple sanity checks
212	@line = split (/\t/, $line);
213	next unless (scalar(@line) >= 2 &&
214	$line[0] =~ /^0x/ &&
215	$line[1] =~ /^0x/);
216
217	my $gbc = hex($line[0]);
218	my $unic = hex($line[1]);
219
220	# gbk has already had the necessary most significant
221	# bits set (it is used in the encoding
222	$gbc = $gbc \| 0x8080 unless ($encoding =~ /gbk/i);
223
224	&addchartrans ($translations{$to}, $gbc, $unic);
225	&addchartrans ($translations{$from}, $unic, $gbc);
226	}
227
228	close (MAPFILE);
229
230	return 1;
231	}
232
233	sub showfilled {
234	my ($block) = @_;
235	my $count = 0;
236	foreach $el (@$block) {
237	if (ref ($el) eq "ARRAY") {
238	print "1";
239	$count++;
240	} else {
241	print "0";
242	}
243	}
244	print "\n";
245	print "total: $count\n";
246	}
247
248
249	# addchartrans adds one character translation to a translation block.
250	# It also simplifies the translation block if possible.
251	sub addchartrans {
252	my ($block, $from, $to) = @_;
253	my $i = 0;
254
255	my $high = ($from / 256) % 256;
256	my $low = $from % 256;
257
258	if (ref ($block->[$high]) ne "ARRAY") {
259	$block->[$high] = [@array256];
260	}
261	$block->[$high]->[$low] = $to;
262	}
263
264
265	sub transchar {
266	my ($encoding, $from) = @_;
267	my $high = ($from / 256) % 256;
268	my $low = $from % 256;
269
270	return 0 unless defined $translations{$encoding};
271
272	my $block = $translations{$encoding};
273
274	if (ref ($block->[$high]) ne "ARRAY") {
275	return 0;
276	}
277	return $block->[$high]->[$low];
278	}
279
280
281	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: