source: trunk/gsdl/perllib/unicode.pm@ 1483

Last change on this file since 1483 was 1227, checked in by sjboddie, 24 years ago

Modified the perl code for importing arabic encoded documents. Plugins
now support a windows_1256 and an iso_8859_6 encoding. I was briefly under
the impression that these two encodings were similar enough to be treated
the same. It turns out they're not. It appears that the Windows codepage
1256 is the most commonly used Arabic encoding so "arabic" is a synonym
for windows_1256.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33%translations = ();
34
35# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36# and returns a unicode array.
37sub ascii2unicode {
38 my ($in) = @_;
39 my $out = [];
40
41 my $i = 0;
42 my $len = length($in);
43 while ($i < $len) {
44 push (@$out, ord(substr ($in, $i, 1)));
45 $i++;
46 }
47
48 return $out;
49}
50
51# arabic2unicode takes an 8 bit Arabic string (ISO-8859-6)
52# and returns a unicode array
53sub arabic2unicode {
54 my ($in) = @_;
55 my $out = [];
56
57 my $i = 0;
58 my $len = length($in);
59 while ($i < $len) {
60 my $c = ord(substr ($in, $i, 1));
61 $c += (1567-191) if ($c >= 0x80);
62 push (@$out, $c);
63 $i++;
64 }
65
66 return $out;
67}
68
69# windows2unicode takes a windows encoding (e.g. Windows 1256 (Arabic))
70# and returns a unicode array. These encodings are similar to but not
71# identical to the corresponding ISO-8859 encodings.
72#
73# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
74sub windows2unicode {
75 my ($encoding, $in) = @_;
76 my $out = [];
77
78 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
79 "WINDOWS", "$encoding.TXT");
80 return $out unless &loadmapping ($encoding, $mapfile);
81
82 my $i = 0;
83 my $len = length($in);
84 while ($i < $len) {
85 my $c = ord(substr ($in, $i, 1));
86 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
87 push (@$out, $c);
88 $i++;
89 }
90
91 return $out;
92}
93
94
95# ascii2utf8 takes a (extended) ascii string and
96# returns a UTF-8 encoded string. This is just
97# a faster version of "&unicode2utf8(&ascii2unicode($str));"
98sub ascii2utf8 {
99 my ($in) = @_;
100 my $out = "";
101
102 my ($c);
103 my $i = 0;
104 my $len = length($in);
105 while ($i < $len) {
106 $c = ord (substr ($in, $i, 1));
107 if ($c < 0x80) {
108 # ascii character
109 $out .= chr ($c);
110
111 } else {
112 # extended ascii character
113 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
114 $out .= chr (0x80 + ($c & 0x3f));
115 }
116 $i++;
117 }
118
119 return $out;
120}
121
122
123# unicode2utf8 takes a unicode array as input and encodes it
124# using utf-8
125sub unicode2utf8 {
126 my ($in) = @_;
127 my $out = "";
128
129 foreach $num (@$in) {
130 if ($num < 0x80) {
131 $out .= chr ($num);
132
133 } elsif ($num < 0x800) {
134 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
135 $out .= chr (0x80 + ($num & 0x3f));
136
137 } elsif ($num < 0xFFFF) {
138 $out .= chr (0xe0 + (($num >> 12) & 0xf));
139 $out .= chr (0x80 + (($num >> 6) & 0x3f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } else {
143 # error, don't encode anything
144 die;
145 }
146 }
147
148 return $out;
149}
150
151
152# utf82unicode takes a utf-8 string and produces a unicode
153# array
154sub utf82unicode {
155 my ($in) = @_;
156 my $out = [];
157
158 my $i = 0;
159 my ($c1, $c2, $c3);
160 $len = length($in);
161 while ($i < $len) {
162 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
163 # normal ascii character
164 push (@$out, $c1);
165
166 } elsif ($c1 < 0xc0) {
167 # error, was expecting the first byte of an
168 # encoded character. Do nothing.
169
170 } elsif ($c1 < 0xe0 && $i+1 < $len) {
171 # an encoded character with two bytes
172 $c2 = ord (substr ($in, $i+1, 1));
173 if ($c2 >= 0x80 && $c2 < 0xc0) {
174 # everything looks ok
175 push (@$out, ((($c1 & 0x1f) << 6) +
176 ($c2 & 0x3f)));
177 $i++; # gobbled an extra byte
178 }
179
180 } elsif ($c1 < 0xf0 && $i+2 < $len) {
181 # an encoded character with three bytes
182 $c2 = ord (substr ($in, $i+1, 1));
183 $c3 = ord (substr ($in, $i+2, 1));
184 if ($c2 >= 0x80 && $c2 < 0xc0 &&
185 $c3 >= 0x80 && $c3 < 0xc0) {
186 # everything looks ok
187 push (@$out, ((($c1 & 0xf) << 12) +
188 (($c2 & 0x3f) << 6) +
189 ($c3 & 0x3f)));
190
191 $i += 2; # gobbled an extra two bytes
192 }
193
194 } else {
195 # error, only decode Unicode characters not full UCS.
196 # Do nothing.
197 }
198
199 $i++;
200 }
201
202 return $out;
203}
204
205
206# unicode2ucs2 takes a unicode array and produces a UCS-2
207# unicode string (every two bytes forms a unicode character)
208sub unicode2ucs2 {
209 my ($in) = @_;
210 my $out = "";
211
212 foreach $num (@$in) {
213 $out .= chr (($num & 0xff00) >> 8);
214 $out .= chr ($num & 0xff);
215 }
216
217 return $out;
218}
219
220
221# ucs22unicode takes a UCS-2 string and produces a unicode array
222sub ucs22unicode {
223 my ($in) = @_;
224 my $out = [];
225
226 my $i = 0;
227 my $len = length ($in);
228 while ($i+1 < $len) {
229 push (@$out, ord (substr($in, $i, 1)) << 8 +
230 ord (substr($in, $i+1, 1)));
231
232 $i ++;
233 }
234
235 return $out;
236}
237
238# loadmapping expects the mapfile to contain (at least) two
239# tab-separated fields. The first field is the mapped value
240# and the second field is the unicode value.
241#
242# It returns 1 if successful, 0 if unsuccessful
243sub loadmapping {
244 my ($encoding, $mapfile) = @_;
245
246 my $to = "$encoding-unicode";
247 my $from = "unicode-$encoding";
248
249 # check to see if the encoding has already been loaded
250 if (defined $translations{$to} && defined $translations{$from}) {
251 return 1;
252 }
253
254 if (!open (MAPFILE, $mapfile)) {
255 print STDERR "ERROR: unable to load mapfile $mapfile\n";
256 return 0;
257 }
258
259 my ($line, @line);
260 $translations{$to} = {};
261 $translations{$from} = {};
262 while (defined ($line = <MAPFILE>)) {
263 # remove comments
264 $line =~ s/\#.*$//;
265 next unless $line =~ /\S/;
266
267 # split the line into fields and do a few
268 # simple sanity checks
269 @line = split (/\t/, $line);
270 next unless (scalar(@line) >= 2 &&
271 $line[0] =~ /^0x/ &&
272 $line[1] =~ /^0x/);
273
274 my $a = hex($line[0]);
275 my $b = hex($line[1]);
276
277 $translations{$to}->{$a} = $b;
278 $translations{$from}->{$b} = $a;
279 }
280
281 close (MAPFILE);
282
283 return 1;
284}
285
286
2871;
Note: See TracBrowser for help on using the repository browser.