Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 932

Last change on this file since 932 was 537, checked in by sjboddie, 25 years ago
added GPL headers
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 4.4 KB

Line
1	###########################################################################
2	#
3	# unicode.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# useful functions for dealing with Unicode
27
28	# Unicode strings are stored as arrays of scalars as perl
29	# lacks characters are 8-bit (currently)
30
31	package unicode;
32
33
34
35	# ascii2unicode takes a (extended) ascii string and
36	# returns a unicode array.
37	sub ascii2unicode {
38	my ($in) = @_;
39	my $out = [];
40
41	my $i = 0;
42	my $len = length($in);
43	while ($i < $len) {
44	push (@$out, ord(substr ($in, $i, 1)));
45	$i++;
46	}
47
48	return $out;
49	}
50
51
52	# ascii2utf8 takes a (extended) ascii string and
53	# returns a UTF-8 encoded string. This is just
54	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
55	sub ascii2utf8 {
56	my ($in) = @_;
57	my $out = "";
58
59	my ($c);
60	my $i = 0;
61	my $len = length($in);
62	while ($i < $len) {
63	$c = ord (substr ($in, $i, 1));
64	if ($c < 0x80) {
65	# ascii character
66	$out .= chr ($c);
67
68	} else {
69	# extended ascii character
70	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
71	$out .= chr (0x80 + ($c & 0x3f));
72	}
73	$i++;
74	}
75
76	return $out;
77	}
78
79
80	# unicode2utf8 takes a unicode array as input and encodes it
81	# using utf-8
82	sub unicode2utf8 {
83	my ($in) = @_;
84	my $out = "";
85
86	foreach $num (@$in) {
87	if ($num < 0x80) {
88	$out .= chr ($num);
89
90	} elsif ($num < 0x800) {
91	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
92	$out .= chr (0x80 + ($num & 0x3f));
93
94	} elsif ($num < 0xFFFF) {
95	$out .= chr (0xe0 + (($num >> 12) & 0xf));
96	$out .= chr (0x80 + (($num >> 6) & 0x3f));
97	$out .= chr (0x80 + ($num & 0x3f));
98
99	} else {
100	# error, don't encode anything
101	die;
102	}
103	}
104
105	return $out;
106	}
107
108
109	# utf82unicode takes a utf-8 string and produces a unicode
110	# array
111	sub utf82unicode {
112	my ($in) = @_;
113	my $out = [];
114
115	my $i = 0;
116	my ($c1, $c2, $c3);
117	$len = length($in);
118	while ($i < $len) {
119	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
120	# normal ascii character
121	push (@$out, $c1);
122
123	} elsif ($c1 < 0xc0) {
124	# error, was expecting the first byte of an
125	# encoded character. Do nothing.
126
127	} elsif ($c1 < 0xe0 && $i+1 < $len) {
128	# an encoded character with two bytes
129	$c2 = ord (substr ($in, $i+1, 1));
130	if ($c2 >= 0x80 && $c2 < 0xc0) {
131	# everything looks ok
132	push (@$out, ((($c1 & 0x1f) << 6) +
133	($c2 & 0x3f)));
134	$i++; # gobbled an extra byte
135	}
136
137	} elsif ($c1 < 0xf0 && $i+2 < $len) {
138	# an encoded character with three bytes
139	$c2 = ord (substr ($in, $i+1, 1));
140	$c3 = ord (substr ($in, $i+2, 1));
141	if ($c2 >= 0x80 && $c2 < 0xc0 &&
142	$c3 >= 0x80 && $c3 < 0xc0) {
143	# everything looks ok
144	push (@$out, ((($c1 & 0xf) << 12) +
145	(($c2 & 0x3f) << 6) +
146	($c3 & 0x3f)));
147
148	$i += 2; # gobbled an extra two bytes
149	}
150
151	} else {
152	# error, only decode Unicode characters not full UCS.
153	# Do nothing.
154	}
155
156	$i++;
157	}
158
159	return $out;
160	}
161
162
163	# unicode2ucs2 takes a unicode array and produces a UCS-2
164	# unicode string (every two bytes forms a unicode character)
165	sub unicode2ucs2 {
166	my ($in) = @_;
167	my $out = "";
168
169	foreach $num (@$in) {
170	$out .= chr (($num & 0xff00) >> 8);
171	$out .= chr ($num & 0xff);
172	}
173
174	return $out;
175	}
176
177
178	# ucs22unicode takes a UCS-2 string and produces a unicode array
179	sub ucs22unicode {
180	my ($in) = @_;
181	my $out = [];
182
183	my $i = 0;
184	my $len = length ($in);
185	while ($i+1 < $len) {
186	push (@$out, ord (substr($in, $i, 1)) << 8 +
187	ord (substr($in, $i+1, 1)));
188
189	$i ++;
190	}
191
192	return $out;
193	}
194
195
196	1;
197

Note: See TracBrowser for help on using the repository browser.

Download in other formats: