Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 1483

Last change on this file since 1483 was 1227, checked in by sjboddie, 24 years ago
Modified the perl code for importing arabic encoded documents. Plugins now support a windows_1256 and an iso_8859_6 encoding. I was briefly under the impression that these two encodings were similar enough to be treated the same. It turns out they're not. It appears that the Windows codepage 1256 is the most commonly used Arabic encoding so "arabic" is a synonym for windows_1256.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 6.6 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	%translations = ();
34
35	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36	# and returns a unicode array.
37	sub ascii2unicode {
38	my ($in) = @_;
39	my $out = [];
40
41	my $i = 0;
42	my $len = length($in);
43	while ($i < $len) {
44	push (@$out, ord(substr ($in, $i, 1)));
45	$i++;
46	}
47
48	return $out;
49	}
50
51	# arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
52	# and returns a unicode array
53	sub arabic2unicode {
54	my ($in) = @_;
55	my $out = [];
56
57	my $i = 0;
58	my $len = length($in);
59	while ($i < $len) {
60	my $c = ord(substr ($in, $i, 1));
61	$c += (1567-191) if ($c >= 0x80);
62	push (@$out, $c);
63	$i++;
64	}
65
66	return $out;
67	}
68
69	# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
70	# and returns a unicode array. These encodings are similar to but not
71	# identical to the corresponding ISO-8859 encodings.
72	#
73	# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
74	sub windows2unicode {
75	my ($encoding, $in) = @_;
76	my $out = [];
77
78	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
79	"WINDOWS", "$encoding.TXT");
80	return $out unless &loadmapping ($encoding, $mapfile);
81
82	my $i = 0;
83	my $len = length($in);
84	while ($i < $len) {
85	my $c = ord(substr ($in, $i, 1));
86	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
87	push (@$out, $c);
88	$i++;
89	}
90
91	return $out;
92	}
93
94
95	# ascii2utf8 takes a (extended) ascii string and
96	# returns a UTF-8 encoded string. This is just
97	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
98	sub ascii2utf8 {
99	my ($in) = @_;
100	my $out = "";
101
102	my ($c);
103	my $i = 0;
104	my $len = length($in);
105	while ($i < $len) {
106	$c = ord (substr ($in, $i, 1));
107	if ($c < 0x80) {
108	# ascii character
109	$out .= chr ($c);
110
111	} else {
112	# extended ascii character
113	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
114	$out .= chr (0x80 + ($c & 0x3f));
115	}
116	$i++;
117	}
118
119	return $out;
120	}
121
122
123	# unicode2utf8 takes a unicode array as input and encodes it
124	# using utf-8
125	sub unicode2utf8 {
126	my ($in) = @_;
127	my $out = "";
128
129	foreach $num (@$in) {
130	if ($num < 0x80) {
131	$out .= chr ($num);
132
133	} elsif ($num < 0x800) {
134	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
135	$out .= chr (0x80 + ($num & 0x3f));
136
137	} elsif ($num < 0xFFFF) {
138	$out .= chr (0xe0 + (($num >> 12) & 0xf));
139	$out .= chr (0x80 + (($num >> 6) & 0x3f));
140	$out .= chr (0x80 + ($num & 0x3f));
141
142	} else {
143	# error, don't encode anything
144	die;
145	}
146	}
147
148	return $out;
149	}
150
151
152	# utf82unicode takes a utf-8 string and produces a unicode
153	# array
154	sub utf82unicode {
155	my ($in) = @_;
156	my $out = [];
157
158	my $i = 0;
159	my ($c1, $c2, $c3);
160	$len = length($in);
161	while ($i < $len) {
162	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
163	# normal ascii character
164	push (@$out, $c1);
165
166	} elsif ($c1 < 0xc0) {
167	# error, was expecting the first byte of an
168	# encoded character. Do nothing.
169
170	} elsif ($c1 < 0xe0 && $i+1 < $len) {
171	# an encoded character with two bytes
172	$c2 = ord (substr ($in, $i+1, 1));
173	if ($c2 >= 0x80 && $c2 < 0xc0) {
174	# everything looks ok
175	push (@$out, ((($c1 & 0x1f) << 6) +
176	($c2 & 0x3f)));
177	$i++; # gobbled an extra byte
178	}
179
180	} elsif ($c1 < 0xf0 && $i+2 < $len) {
181	# an encoded character with three bytes
182	$c2 = ord (substr ($in, $i+1, 1));
183	$c3 = ord (substr ($in, $i+2, 1));
184	if ($c2 >= 0x80 && $c2 < 0xc0 &&
185	$c3 >= 0x80 && $c3 < 0xc0) {
186	# everything looks ok
187	push (@$out, ((($c1 & 0xf) << 12) +
188	(($c2 & 0x3f) << 6) +
189	($c3 & 0x3f)));
190
191	$i += 2; # gobbled an extra two bytes
192	}
193
194	} else {
195	# error, only decode Unicode characters not full UCS.
196	# Do nothing.
197	}
198
199	$i++;
200	}
201
202	return $out;
203	}
204
205
206	# unicode2ucs2 takes a unicode array and produces a UCS-2
207	# unicode string (every two bytes forms a unicode character)
208	sub unicode2ucs2 {
209	my ($in) = @_;
210	my $out = "";
211
212	foreach $num (@$in) {
213	$out .= chr (($num & 0xff00) >> 8);
214	$out .= chr ($num & 0xff);
215	}
216
217	return $out;
218	}
219
220
221	# ucs22unicode takes a UCS-2 string and produces a unicode array
222	sub ucs22unicode {
223	my ($in) = @_;
224	my $out = [];
225
226	my $i = 0;
227	my $len = length ($in);
228	while ($i+1 < $len) {
229	push (@$out, ord (substr($in, $i, 1)) << 8 +
230	ord (substr($in, $i+1, 1)));
231
232	$i ++;
233	}
234
235	return $out;
236	}
237
238	# loadmapping expects the mapfile to contain (at least) two
239	# tab-separated fields. The first field is the mapped value
240	# and the second field is the unicode value.
241	#
242	# It returns 1 if successful, 0 if unsuccessful
243	sub loadmapping {
244	my ($encoding, $mapfile) = @_;
245
246	my $to = "$encoding-unicode";
247	my $from = "unicode-$encoding";
248
249	# check to see if the encoding has already been loaded
250	if (defined $translations{$to} && defined $translations{$from}) {
251	return 1;
252	}
253
254	if (!open (MAPFILE, $mapfile)) {
255	print STDERR "ERROR: unable to load mapfile $mapfile\n";
256	return 0;
257	}
258
259	my ($line, @line);
260	$translations{$to} = {};
261	$translations{$from} = {};
262	while (defined ($line = <MAPFILE>)) {
263	# remove comments
264	$line =~ s/\#.*$//;
265	next unless $line =~ /\S/;
266
267	# split the line into fields and do a few
268	# simple sanity checks
269	@line = split (/\t/, $line);
270	next unless (scalar(@line) >= 2 &&
271	$line[0] =~ /^0x/ &&
272	$line[1] =~ /^0x/);
273
274	my $a = hex($line[0]);
275	my $b = hex($line[1]);
276
277	$translations{$to}->{$a} = $b;
278	$translations{$from}->{$b} = $a;
279	}
280
281	close (MAPFILE);
282
283	return 1;
284	}
285
286
287	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: