Context Navigation

source: trunk/gsdl/perllib/unicode.pm@ 81

Last change on this file since 81 was 73, checked in by rjmcnab, 26 years ago
Added support for UTF-8.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 3.3 KB

Line
1	# useful functions for dealing with Unicode
2
3	# Unicode strings are stored as arrays of scalars as perl
4	# lacks characters are 8-bit (currently)
5
6	package unicode;
7
8
9
10	# ascii2unicode takes a (extended) ascii string and
11	# returns a unicode array.
12	sub ascii2unicode {
13	my ($in) = @_;
14	my $out = [];
15
16	my $i = 0;
17	my $len = length($in);
18	while ($i < $len) {
19	push (@$out, ord(substr ($in, $i, 1)));
20	$i++;
21	}
22
23	return $out;
24	}
25
26
27	# ascii2utf8 takes a (extended) ascii string and
28	# returns a UTF-8 encoded string. This is just
29	# a faster version of "&unicode2utf8(&ascii2unicode($str));"
30	sub ascii2utf8 {
31	my ($in) = @_;
32	my $out = "";
33
34	my ($c);
35	my $i = 0;
36	my $len = length($in);
37	while ($i < $len) {
38	$c = ord (substr ($in, $i, 1));
39	if ($c < 0x80) {
40	# ascii character
41	$out .= chr ($c);
42
43	} else {
44	# extended ascii character
45	$out .= chr (0xc0 + (($c >> 6) & 0x1f));
46	$out .= chr (0x80 + ($c & 0x3f));
47	}
48	$i++;
49	}
50
51	return $out;
52	}
53
54
55	# unicode2utf8 takes a unicode array as input and encodes it
56	# using utf-8
57	sub unicode2utf8 {
58	my ($in) = @_;
59	my $out = "";
60
61	foreach $num (@$in) {
62	if ($num < 0x80) {
63	$out .= chr ($num);
64
65	} elsif ($num < 0x800) {
66	$out .= chr (0xc0 + (($num >> 6) & 0x1f));
67	$out .= chr (0x80 + ($num & 0x3f));
68
69	} elsif ($num < 0xFFFF) {
70	$out .= chr (0xe0 + (($num >> 12) & 0xf));
71	$out .= chr (0x80 + (($num >> 6) & 0x3f));
72	$out .= chr (0x80 + ($num & 0x3f));
73
74	} else {
75	# error, don't encode anything
76	die;
77	}
78	}
79
80	return $out;
81	}
82
83
84	# utf82unicode takes a utf-8 string and produces a unicode
85	# array
86	sub utf82unicode {
87	my ($in) = @_;
88	my $out = [];
89
90	my $i = 0;
91	my ($c1, $c2, $c3);
92	$len = length($in);
93	while ($i < $len) {
94	if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
95	# normal ascii character
96	push (@$out, $c1);
97
98	} elsif ($c1 < 0xc0) {
99	# error, was expecting the first byte of an
100	# encoded character. Do nothing.
101
102	} elsif ($c1 < 0xe0 && $i+1 < $len) {
103	# an encoded character with two bytes
104	$c2 = ord (substr ($in, $i+1, 1));
105	if ($c2 >= 0x80 && $c2 < 0xc0) {
106	# everything looks ok
107	push (@$out, ((($c1 & 0x1f) << 6) +
108	($c2 & 0x3f)));
109	$i++; # gobbled an extra byte
110	}
111
112	} elsif ($c1 < 0xf0 && $i+2 < $len) {
113	# an encoded character with three bytes
114	$c2 = ord (substr ($in, $i+1, 1));
115	$c3 = ord (substr ($in, $i+2, 1));
116	if ($c2 >= 0x80 && $c2 < 0xc0 &&
117	$c3 >= 0x80 && $c3 < 0xc0) {
118	# everything looks ok
119	push (@$out, ((($c1 & 0xf) << 12) +
120	(($c2 & 0x3f) << 6) +
121	($c3 & 0x3f)));
122
123	$i += 2; # gobbled an extra two bytes
124	}
125
126	} else {
127	# error, only decode Unicode characters not full UCS.
128	# Do nothing.
129	}
130
131	$i++;
132	}
133
134	return $out;
135	}
136
137
138	# unicode2ucs2 takes a unicode array and produces a UCS-2
139	# unicode string (every two bytes forms a unicode character)
140	sub unicode2ucs2 {
141	my ($in) = @_;
142	my $out = "";
143
144	foreach $num (@$in) {
145	$out .= chr (($num & 0xff00) >> 8);
146	$out .= chr ($num & 0xff);
147	}
148
149	return $out;
150	}
151
152
153	# ucs22unicode takes a UCS-2 string and produces a unicode array
154	sub ucs22unicode {
155	my ($in) = @_;
156	my $out = [];
157
158	my $i = 0;
159	my $len = length ($in);
160	while ($i+1 < $len) {
161	push (@$out, ord (substr($in, $i, 1)) << 8 +
162	ord (substr($in, $i+1, 1)));
163
164	$i ++;
165	}
166
167	return $out;
168	}
169
170
171	1;
172

Note: See TracBrowser for help on using the repository browser.

Download in other formats: