source: trunk/gsdl/perllib/unicode.pm@ 1844

Last change on this file since 1844 was 1844, checked in by sjboddie, 23 years ago

Added an 'auto' argument to BasPlug's '-input_encoding' option ('auto' is
now the default instead of 'ascii'). Wihen -input_encoding is 'auto' textcat
is used to work out the language and encoding of each document prior to
processing it. This allows for documents within the same collection to be
in different encodings and all be imported correctly (as long as they're
in an encoding that's supported - notable exceptions at the moment are
Big5 Chinese and any kind of Japanese).
Doing things this way means each document is read in twice at import time,
no doubt slowing things down considerably. You can therefore still set
-input_encoding explicitly if you know that all your documents are a
particular encoding.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 8.1 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33%translations = ();
34
35# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
36# and returns a unicode array.
37sub ascii2unicode {
38 my ($in) = @_;
39 my $out = [];
40
41 my $i = 0;
42 my $len = length($in);
43 while ($i < $len) {
44 push (@$out, ord(substr ($in, $i, 1)));
45 $i++;
46 }
47
48 return $out;
49}
50
51# windows2unicode takes a windows encoded string (e.g. Windows 1256 (Arabic))
52# and returns a unicode array. These encodings are similar to but not
53# identical to the corresponding ISO-8859 encodings.
54#
55# $encoding should be the code page name (e.g. '1252')
56#
57# The map files for these encodings should be in unicode/MAPPINGS/WINDOWS
58sub windows2unicode {
59 my ($encoding, $in) = @_;
60 my $out = [];
61
62 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
63 "WINDOWS", "$encoding.TXT");
64 return $out unless &loadmapping ($encoding, $mapfile);
65
66 my $i = 0;
67 my $len = length($in);
68 while ($i < $len) {
69 my $c = ord(substr ($in, $i, 1));
70 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
71 push (@$out, $c);
72 $i++;
73 }
74
75 return $out;
76}
77
78# iso2unicode takes an iso-8859 encoded string (e.g. iso-8859-6 (Arabic))
79# and returns a unicode array. This function is much like windows2unicode()
80# except that only characters >= 0xA0 are read from the mapping file (since
81# all characters below that are the same for all iso-8859 character sets
82# and therefore already the same as unicode).
83#
84# Note that while this function will work for iso-8859-1 (latin 1) it'll be
85# much faster to use ascii2unicode() or ascii2utf8()
86#
87# $encoding should be 1,2,3...,9 depending on which breed of iso-8859 the
88# encoding is
89#
90# The map files for these encodings should be in unicode/MAPPINGS/ISO_8859
91sub iso2unicode {
92 my ($encoding, $in) = @_;
93 my $out = [];
94
95 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
96 "ISO_8859", "$encoding.TXT");
97 return $out unless &loadmapping ($encoding, $mapfile);
98
99 my $i = 0;
100 my $len = length($in);
101 while ($i < $len) {
102 my $c = ord(substr ($in, $i, 1));
103 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0xA0);
104 push (@$out, $c);
105 $i++;
106 }
107
108 return $out;
109}
110
111# cyrillic2unicode is basically identical to windows2unicode, the only
112# difference being that the map files live in unicode/MAPPINGS/CYRILLIC
113#
114# values for $encoding may be 'koi8_r' or 'koi8_u'
115sub cyrillic2unicode {
116 my ($encoding, $in) = @_;
117 my $out = [];
118
119 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "unicode", "MAPPINGS",
120 "CYRILLIC", "$encoding.txt");
121 return $out unless &loadmapping ($encoding, $mapfile);
122
123 my $i = 0;
124 my $len = length($in);
125 while ($i < $len) {
126 my $c = ord(substr ($in, $i, 1));
127 $c = $translations{"$encoding-unicode"}->{$c} if ($c >= 0x80);
128 push (@$out, $c);
129 $i++;
130 }
131
132 return $out;
133}
134
135
136# ascii2utf8 takes a (extended) ascii string and
137# returns a UTF-8 encoded string. This is just
138# a faster version of "&unicode2utf8(&ascii2unicode($str));"
139sub ascii2utf8 {
140 my ($in) = @_;
141 my $out = "";
142
143 my ($c);
144 my $i = 0;
145 my $len = length($in);
146 while ($i < $len) {
147 $c = ord (substr ($in, $i, 1));
148 if ($c < 0x80) {
149 # ascii character
150 $out .= chr ($c);
151
152 } else {
153 # extended ascii character
154 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
155 $out .= chr (0x80 + ($c & 0x3f));
156 }
157 $i++;
158 }
159
160 return $out;
161}
162
163
164# unicode2utf8 takes a unicode array as input and encodes it
165# using utf-8
166sub unicode2utf8 {
167 my ($in) = @_;
168 my $out = "";
169
170 foreach $num (@$in) {
171 if ($num < 0x80) {
172 $out .= chr ($num);
173
174 } elsif ($num < 0x800) {
175 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
176 $out .= chr (0x80 + ($num & 0x3f));
177
178 } elsif ($num < 0xFFFF) {
179 $out .= chr (0xe0 + (($num >> 12) & 0xf));
180 $out .= chr (0x80 + (($num >> 6) & 0x3f));
181 $out .= chr (0x80 + ($num & 0x3f));
182
183 } else {
184 # error, don't encode anything
185 die;
186 }
187 }
188
189 return $out;
190}
191
192
193# utf82unicode takes a utf-8 string and produces a unicode
194# array
195sub utf82unicode {
196 my ($in) = @_;
197 my $out = [];
198
199 my $i = 0;
200 my ($c1, $c2, $c3);
201 $len = length($in);
202 while ($i < $len) {
203 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
204 # normal ascii character
205 push (@$out, $c1);
206
207 } elsif ($c1 < 0xc0) {
208 # error, was expecting the first byte of an
209 # encoded character. Do nothing.
210
211 } elsif ($c1 < 0xe0 && $i+1 < $len) {
212 # an encoded character with two bytes
213 $c2 = ord (substr ($in, $i+1, 1));
214 if ($c2 >= 0x80 && $c2 < 0xc0) {
215 # everything looks ok
216 push (@$out, ((($c1 & 0x1f) << 6) +
217 ($c2 & 0x3f)));
218 $i++; # gobbled an extra byte
219 }
220
221 } elsif ($c1 < 0xf0 && $i+2 < $len) {
222 # an encoded character with three bytes
223 $c2 = ord (substr ($in, $i+1, 1));
224 $c3 = ord (substr ($in, $i+2, 1));
225 if ($c2 >= 0x80 && $c2 < 0xc0 &&
226 $c3 >= 0x80 && $c3 < 0xc0) {
227 # everything looks ok
228 push (@$out, ((($c1 & 0xf) << 12) +
229 (($c2 & 0x3f) << 6) +
230 ($c3 & 0x3f)));
231
232 $i += 2; # gobbled an extra two bytes
233 }
234
235 } else {
236 # error, only decode Unicode characters not full UCS.
237 # Do nothing.
238 }
239
240 $i++;
241 }
242
243 return $out;
244}
245
246
247# unicode2ucs2 takes a unicode array and produces a UCS-2
248# unicode string (every two bytes forms a unicode character)
249sub unicode2ucs2 {
250 my ($in) = @_;
251 my $out = "";
252
253 foreach $num (@$in) {
254 $out .= chr (($num & 0xff00) >> 8);
255 $out .= chr ($num & 0xff);
256 }
257
258 return $out;
259}
260
261
262# ucs22unicode takes a UCS-2 string and produces a unicode array
263sub ucs22unicode {
264 my ($in) = @_;
265 my $out = [];
266
267 my $i = 0;
268 my $len = length ($in);
269 while ($i+1 < $len) {
270 push (@$out, ord (substr($in, $i, 1)) << 8 +
271 ord (substr($in, $i+1, 1)));
272
273 $i ++;
274 }
275
276 return $out;
277}
278
279# loadmapping expects the mapfile to contain (at least) two
280# tab-separated fields. The first field is the mapped value
281# and the second field is the unicode value.
282#
283# It returns 1 if successful, 0 if unsuccessful
284sub loadmapping {
285 my ($encoding, $mapfile) = @_;
286
287 my $to = "$encoding-unicode";
288 my $from = "unicode-$encoding";
289
290 # check to see if the encoding has already been loaded
291 if (defined $translations{$to} && defined $translations{$from}) {
292 return 1;
293 }
294
295 if (!open (MAPFILE, $mapfile)) {
296 print STDERR "ERROR: unable to load mapfile $mapfile\n";
297 return 0;
298 }
299
300 my ($line, @line);
301 $translations{$to} = {};
302 $translations{$from} = {};
303 while (defined ($line = <MAPFILE>)) {
304 # remove comments
305 $line =~ s/\#.*$//;
306 next unless $line =~ /\S/;
307
308 # split the line into fields and do a few
309 # simple sanity checks
310 @line = split (/\t/, $line);
311 next unless (scalar(@line) >= 2 &&
312 $line[0] =~ /^0x/ &&
313 $line[1] =~ /^0x/);
314
315 my $a = hex($line[0]);
316 my $b = hex($line[1]);
317
318 $translations{$to}->{$a} = $b;
319 $translations{$from}->{$b} = $a;
320 }
321
322 close (MAPFILE);
323
324 return 1;
325}
326
327
3281;
Note: See TracBrowser for help on using the repository browser.