Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago
Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat is used to work out the language and encoding of each document prior to processing it. This allows for documents within the same collection to be in different encodings and all be imported correctly (as long as they're in an encoding that's supported - notable exceptions at the moment are Big5 Chinese and any kind of Japanese). Doing things this way means each document is read in twice at import time, no doubt slowing things down considerably. You can therefore still set -input_encoding explicitly if you know that all your documents are a particular encoding.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 8.1 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33	%translations = ();
34
35	# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36	# and returns a unicode array.
37	sub ascii2unicode {
38	my ($in) = @_;
39	my $out = [];
40
41	my $i = 0;
42	my $len = length($in);
43	while ($i < $len) {
44	push (@$out, ord(substr ($in, $i, 1)));
45	$i++;
46	}
47
48	return $out;
49	}
50
51	# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
52	# and returns a unicode array. These encodings are similar to but not
53	# identical to the corresponding ISO-8859 encodings.
54	#
55	# $encoding should be the code page name (e.g. '1252')
56	#
57	# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
58	sub windows2unicode {
59	my ($encoding, $in) = @_;
60	my $out = [];
61
62	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
63	"WINDOWS", "$encoding.TXT");
64	return $out unless &loadmapping ($encoding, $mapfile);
65
66	my $i = 0;
67	my $len = length($in);
68	while ($i < $len) {
69	my $c = ord(substr ($in, $i, 1));
70	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
71	push (@$out, $c);
72	$i++;
73	}
74
75	return $out;
76	}
77
78	# iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
79	# and returns a unicode array. This function is much like windows2unicode()
80	# except that only characters >= 0xA0 are read from the mapping file (since
81	# all characters below that are the same for all iso-8859 character sets
82	# and therefore already the same as unicode).
83	#
84	# Note that while this function will work for iso-8859-1 (latin 1) it'll be
85	# much faster to use ascii2unicode() or ascii2utf8()
86	#
87	# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
88	# encoding is
89	#
90	# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
91	sub iso2unicode {
92	my ($encoding, $in) = @_;
93	my $out = [];
94
95	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
96	"ISO_8859", "$encoding.TXT");
97	return $out unless &loadmapping ($encoding, $mapfile);
98
99	my $i = 0;
100	my $len = length($in);
101	while ($i < $len) {
102	my $c = ord(substr ($in, $i, 1));
103	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
104	push (@$out, $c);
105	$i++;
106	}
107
108	return $out;
109	}
110
111	# cyrillic2unicode is basically identical to windows2unicode, the only
112	# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
113	#
114	# values for $encoding may be 'koi8_r' or 'koi8_u'
115	sub cyrillic2unicode {
116	my ($encoding, $in) = @_;
117	my $out = [];
118
119	my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
120	"CYRILLIC", "$encoding.txt");
121	return $out unless &loadmapping ($encoding, $mapfile);
122
123	my $i = 0;
124	my $len = length($in);
125	while ($i < $len) {
126	my $c = ord(substr ($in, $i, 1));
127	$c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
128	push (@$out, $c);
129	$i++;
130	}
131
132	return $out;
133	}
134
135
136	# ascii2utf8 takes a (extended) ascii string and
137	# returns a UTF-8 encoded string. This is just
138	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
139	sub ascii2utf8 {
140	my ($in) = @_;
141	my $out = "";
142
143	my ($c);
144	my $i = 0;
145	my $len = length($in);
146	while ($i < $len) {
147	$c = ord (substr ($in, $i, 1));
148	if ($c < 0x80) {
149	# ascii character
150	$out .= chr ($c);
151
152	} else {
153	# extended ascii character
154	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
155	$out .= chr (0x80 + ($c & 0x3f));
156	}
157	$i++;
158	}
159
160	return $out;
161	}
162
163
164	# unicode2utf8 takes a unicode array as input and encodes it
165	# using utf-8
166	sub unicode2utf8 {
167	my ($in) = @_;
168	my $out = "";
169
170	foreach $num (@$in) {
171	if ($num < 0x80) {
172	$out .= chr ($num);
173
174	} elsif ($num < 0x800) {
175	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
176	$out .= chr (0x80 + ($num & 0x3f));
177
178	} elsif ($num < 0xFFFF) {
179	$out .= chr (0xe0 + (($num >> 12) & 0xf));
180	$out .= chr (0x80 + (($num >> 6) & 0x3f));
181	$out .= chr (0x80 + ($num & 0x3f));
182
183	} else {
184	# error, don't encode anything
185	die;
186	}
187	}
188
189	return $out;
190	}
191
192
193	# utf82unicode takes a utf-8 string and produces a unicode
194	# array
195	sub utf82unicode {
196	my ($in) = @_;
197	my $out = [];
198
199	my $i = 0;
200	my ($c1, $c2, $c3);
201	$len = length($in);
202	while ($i < $len) {
203	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
204	# normal ascii character
205	push (@$out, $c1);
206
207	} elsif ($c1 < 0xc0) {
208	# error, was expecting the first byte of an
209	# encoded character. Do nothing.
210
211	} elsif ($c1 < 0xe0 && $i+1 < $len) {
212	# an encoded character with two bytes
213	$c2 = ord (substr ($in, $i+1, 1));
214	if ($c2 >= 0x80 && $c2 < 0xc0) {
215	# everything looks ok
216	push (@$out, ((($c1 & 0x1f) << 6) +
217	($c2 & 0x3f)));
218	$i++; # gobbled an extra byte
219	}
220
221	} elsif ($c1 < 0xf0 && $i+2 < $len) {
222	# an encoded character with three bytes
223	$c2 = ord (substr ($in, $i+1, 1));
224	$c3 = ord (substr ($in, $i+2, 1));
225	if ($c2 >= 0x80 && $c2 < 0xc0 &&
226	$c3 >= 0x80 && $c3 < 0xc0) {
227	# everything looks ok
228	push (@$out, ((($c1 & 0xf) << 12) +
229	(($c2 & 0x3f) << 6) +
230	($c3 & 0x3f)));
231
232	$i += 2; # gobbled an extra two bytes
233	}
234
235	} else {
236	# error, only decode Unicode characters not full UCS.
237	# Do nothing.
238	}
239
240	$i++;
241	}
242
243	return $out;
244	}
245
246
247	# unicode2ucs2 takes a unicode array and produces a UCS-2
248	# unicode string (every two bytes forms a unicode character)
249	sub unicode2ucs2 {
250	my ($in) = @_;
251	my $out = "";
252
253	foreach $num (@$in) {
254	$out .= chr (($num & 0xff00) >> 8);
255	$out .= chr ($num & 0xff);
256	}
257
258	return $out;
259	}
260
261
262	# ucs22unicode takes a UCS-2 string and produces a unicode array
263	sub ucs22unicode {
264	my ($in) = @_;
265	my $out = [];
266
267	my $i = 0;
268	my $len = length ($in);
269	while ($i+1 < $len) {
270	push (@$out, ord (substr($in, $i, 1)) << 8 +
271	ord (substr($in, $i+1, 1)));
272
273	$i ++;
274	}
275
276	return $out;
277	}
278
279	# loadmapping expects the mapfile to contain (at least) two
280	# tab-separated fields. The first field is the mapped value
281	# and the second field is the unicode value.
282	#
283	# It returns 1 if successful, 0 if unsuccessful
284	sub loadmapping {
285	my ($encoding, $mapfile) = @_;
286
287	my $to = "$encoding-unicode";
288	my $from = "unicode-$encoding";
289
290	# check to see if the encoding has already been loaded
291	if (defined $translations{$to} && defined $translations{$from}) {
292	return 1;
293	}
294
295	if (!open (MAPFILE, $mapfile)) {
296	print STDERR "ERROR: unable to load mapfile $mapfile\n";
297	return 0;
298	}
299
300	my ($line, @line);
301	$translations{$to} = {};
302	$translations{$from} = {};
303	while (defined ($line = <MAPFILE>)) {
304	# remove comments
305	$line =~ s/\#.*$//;
306	next unless $line =~ /\S/;
307
308	# split the line into fields and do a few
309	# simple sanity checks
310	@line = split (/\t/, $line);
311	next unless (scalar(@line) >= 2 &&
312	$line[0] =~ /^0x/ &&
313	$line[1] =~ /^0x/);
314
315	my $a = hex($line[0]);
316	my $b = hex($line[1]);
317
318	$translations{$to}->{$a} = $b;
319	$translations{$from}->{$b} = $a;
320	}
321
322	close (MAPFILE);
323
324	return 1;
325	}
326
327
328	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: