source: trunk/gsdl/perllib/unicode.pm@ 537

Last change on this file since 537 was 537, checked in by sjboddie, 25 years ago

added GPL headers

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 4.4 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33
34
35# ascii2unicode takes a (extended) ascii string and
36# returns a unicode array.
37sub ascii2unicode {
38 my ($in) = @_;
39 my $out = [];
40
41 my $i = 0;
42 my $len = length($in);
43 while ($i < $len) {
44 push (@$out, ord(substr ($in, $i, 1)));
45 $i++;
46 }
47
48 return $out;
49}
50
51
52# ascii2utf8 takes a (extended) ascii string and
53# returns a UTF-8 encoded string. This is just
54# a faster version of "&unicode2utf8(&ascii2unicode($str));"
55sub ascii2utf8 {
56 my ($in) = @_;
57 my $out = "";
58
59 my ($c);
60 my $i = 0;
61 my $len = length($in);
62 while ($i < $len) {
63 $c = ord (substr ($in, $i, 1));
64 if ($c < 0x80) {
65 # ascii character
66 $out .= chr ($c);
67
68 } else {
69 # extended ascii character
70 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
71 $out .= chr (0x80 + ($c & 0x3f));
72 }
73 $i++;
74 }
75
76 return $out;
77}
78
79
80# unicode2utf8 takes a unicode array as input and encodes it
81# using utf-8
82sub unicode2utf8 {
83 my ($in) = @_;
84 my $out = "";
85
86 foreach $num (@$in) {
87 if ($num < 0x80) {
88 $out .= chr ($num);
89
90 } elsif ($num < 0x800) {
91 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
92 $out .= chr (0x80 + ($num & 0x3f));
93
94 } elsif ($num < 0xFFFF) {
95 $out .= chr (0xe0 + (($num >> 12) & 0xf));
96 $out .= chr (0x80 + (($num >> 6) & 0x3f));
97 $out .= chr (0x80 + ($num & 0x3f));
98
99 } else {
100 # error, don't encode anything
101 die;
102 }
103 }
104
105 return $out;
106}
107
108
109# utf82unicode takes a utf-8 string and produces a unicode
110# array
111sub utf82unicode {
112 my ($in) = @_;
113 my $out = [];
114
115 my $i = 0;
116 my ($c1, $c2, $c3);
117 $len = length($in);
118 while ($i < $len) {
119 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
120 # normal ascii character
121 push (@$out, $c1);
122
123 } elsif ($c1 < 0xc0) {
124 # error, was expecting the first byte of an
125 # encoded character. Do nothing.
126
127 } elsif ($c1 < 0xe0 && $i+1 < $len) {
128 # an encoded character with two bytes
129 $c2 = ord (substr ($in, $i+1, 1));
130 if ($c2 >= 0x80 && $c2 < 0xc0) {
131 # everything looks ok
132 push (@$out, ((($c1 & 0x1f) << 6) +
133 ($c2 & 0x3f)));
134 $i++; # gobbled an extra byte
135 }
136
137 } elsif ($c1 < 0xf0 && $i+2 < $len) {
138 # an encoded character with three bytes
139 $c2 = ord (substr ($in, $i+1, 1));
140 $c3 = ord (substr ($in, $i+2, 1));
141 if ($c2 >= 0x80 && $c2 < 0xc0 &&
142 $c3 >= 0x80 && $c3 < 0xc0) {
143 # everything looks ok
144 push (@$out, ((($c1 & 0xf) << 12) +
145 (($c2 & 0x3f) << 6) +
146 ($c3 & 0x3f)));
147
148 $i += 2; # gobbled an extra two bytes
149 }
150
151 } else {
152 # error, only decode Unicode characters not full UCS.
153 # Do nothing.
154 }
155
156 $i++;
157 }
158
159 return $out;
160}
161
162
163# unicode2ucs2 takes a unicode array and produces a UCS-2
164# unicode string (every two bytes forms a unicode character)
165sub unicode2ucs2 {
166 my ($in) = @_;
167 my $out = "";
168
169 foreach $num (@$in) {
170 $out .= chr (($num & 0xff00) >> 8);
171 $out .= chr ($num & 0xff);
172 }
173
174 return $out;
175}
176
177
178# ucs22unicode takes a UCS-2 string and produces a unicode array
179sub ucs22unicode {
180 my ($in) = @_;
181 my $out = [];
182
183 my $i = 0;
184 my $len = length ($in);
185 while ($i+1 < $len) {
186 push (@$out, ord (substr($in, $i, 1)) << 8 +
187 ord (substr($in, $i+1, 1)));
188
189 $i ++;
190 }
191
192 return $out;
193}
194
195
1961;
197
Note: See TracBrowser for help on using the repository browser.