source: trunk/gsdl/perllib/unicode.pm@ 81

Last change on this file since 81 was 73, checked in by rjmcnab, 26 years ago

Added support for UTF-8.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 3.3 KB
Line 
1# useful functions for dealing with Unicode
2
3# Unicode strings are stored as arrays of scalars as perl
4# lacks characters are 8-bit (currently)
5
6package unicode;
7
8
9
10# ascii2unicode takes a (extended) ascii string and
11# returns a unicode array.
12sub ascii2unicode {
13 my ($in) = @_;
14 my $out = [];
15
16 my $i = 0;
17 my $len = length($in);
18 while ($i < $len) {
19 push (@$out, ord(substr ($in, $i, 1)));
20 $i++;
21 }
22
23 return $out;
24}
25
26
27# ascii2utf8 takes a (extended) ascii string and
28# returns a UTF-8 encoded string. This is just
29# a faster version of "&unicode2utf8(&ascii2unicode($str));"
30sub ascii2utf8 {
31 my ($in) = @_;
32 my $out = "";
33
34 my ($c);
35 my $i = 0;
36 my $len = length($in);
37 while ($i < $len) {
38 $c = ord (substr ($in, $i, 1));
39 if ($c < 0x80) {
40 # ascii character
41 $out .= chr ($c);
42
43 } else {
44 # extended ascii character
45 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
46 $out .= chr (0x80 + ($c & 0x3f));
47 }
48 $i++;
49 }
50
51 return $out;
52}
53
54
55# unicode2utf8 takes a unicode array as input and encodes it
56# using utf-8
57sub unicode2utf8 {
58 my ($in) = @_;
59 my $out = "";
60
61 foreach $num (@$in) {
62 if ($num < 0x80) {
63 $out .= chr ($num);
64
65 } elsif ($num < 0x800) {
66 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
67 $out .= chr (0x80 + ($num & 0x3f));
68
69 } elsif ($num < 0xFFFF) {
70 $out .= chr (0xe0 + (($num >> 12) & 0xf));
71 $out .= chr (0x80 + (($num >> 6) & 0x3f));
72 $out .= chr (0x80 + ($num & 0x3f));
73
74 } else {
75 # error, don't encode anything
76 die;
77 }
78 }
79
80 return $out;
81}
82
83
84# utf82unicode takes a utf-8 string and produces a unicode
85# array
86sub utf82unicode {
87 my ($in) = @_;
88 my $out = [];
89
90 my $i = 0;
91 my ($c1, $c2, $c3);
92 $len = length($in);
93 while ($i < $len) {
94 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
95 # normal ascii character
96 push (@$out, $c1);
97
98 } elsif ($c1 < 0xc0) {
99 # error, was expecting the first byte of an
100 # encoded character. Do nothing.
101
102 } elsif ($c1 < 0xe0 && $i+1 < $len) {
103 # an encoded character with two bytes
104 $c2 = ord (substr ($in, $i+1, 1));
105 if ($c2 >= 0x80 && $c2 < 0xc0) {
106 # everything looks ok
107 push (@$out, ((($c1 & 0x1f) << 6) +
108 ($c2 & 0x3f)));
109 $i++; # gobbled an extra byte
110 }
111
112 } elsif ($c1 < 0xf0 && $i+2 < $len) {
113 # an encoded character with three bytes
114 $c2 = ord (substr ($in, $i+1, 1));
115 $c3 = ord (substr ($in, $i+2, 1));
116 if ($c2 >= 0x80 && $c2 < 0xc0 &&
117 $c3 >= 0x80 && $c3 < 0xc0) {
118 # everything looks ok
119 push (@$out, ((($c1 & 0xf) << 12) +
120 (($c2 & 0x3f) << 6) +
121 ($c3 & 0x3f)));
122
123 $i += 2; # gobbled an extra two bytes
124 }
125
126 } else {
127 # error, only decode Unicode characters not full UCS.
128 # Do nothing.
129 }
130
131 $i++;
132 }
133
134 return $out;
135}
136
137
138# unicode2ucs2 takes a unicode array and produces a UCS-2
139# unicode string (every two bytes forms a unicode character)
140sub unicode2ucs2 {
141 my ($in) = @_;
142 my $out = "";
143
144 foreach $num (@$in) {
145 $out .= chr (($num & 0xff00) >> 8);
146 $out .= chr ($num & 0xff);
147 }
148
149 return $out;
150}
151
152
153# ucs22unicode takes a UCS-2 string and produces a unicode array
154sub ucs22unicode {
155 my ($in) = @_;
156 my $out = [];
157
158 my $i = 0;
159 my $len = length ($in);
160 while ($i+1 < $len) {
161 push (@$out, ord (substr($in, $i, 1)) << 8 +
162 ord (substr($in, $i+1, 1)));
163
164 $i ++;
165 }
166
167 return $out;
168}
169
170
1711;
172
Note: See TracBrowser for help on using the repository browser.