source: gsdl/trunk/perllib/unicode.pm@ 18338

Last change on this file since 18338 was 18338, checked in by ak19, 15 years ago

Moved the line of code that replaced spaces with underscores to the subroutine util::rename_file, since this replacement ought to be done not only when url_encoding but also when using base64 encoding.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.5 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38
39no strict 'refs';
40
41# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
42# and returns a unicode array.
43sub ascii2unicode {
44 my ($in) = @_;
45 my $out = [];
46
47 my $i = 0;
48 my $len = length($in);
49 while ($i < $len) {
50 push (@$out, ord(substr ($in, $i, 1)));
51 $i++;
52 }
53
54 return $out;
55}
56
57# ascii2utf8 takes a reference to an (extended) ascii string and returns a
58# UTF-8 encoded string. This is just a faster version of
59# "&unicode2utf8(&ascii2unicode($str));"
60# "Extended ascii" really means "iso_8859_1"
61sub ascii2utf8 {
62 my ($in) = @_;
63 my $out = "";
64
65 if (!defined($in)|| !defined($$in)) {
66 return $out;
67 }
68
69 my ($c);
70 my $i = 0;
71 my $len = length($$in);
72 while ($i < $len) {
73 $c = ord (substr ($$in, $i, 1));
74 if ($c < 0x80) {
75 # ascii character
76 $out .= chr ($c);
77
78 } else {
79 # extended ascii character
80 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
81 $out .= chr (0x80 + ($c & 0x3f));
82 }
83 $i++;
84 }
85
86 return $out;
87}
88
89# unicode2utf8 takes a unicode array as input and encodes it
90# using utf-8
91sub unicode2utf8 {
92 my ($in) = @_;
93 my $out = "";
94
95 foreach my $num (@$in) {
96 next unless defined $num;
97 if ($num < 0x80) {
98 $out .= chr ($num);
99
100 } elsif ($num < 0x800) {
101 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
102 $out .= chr (0x80 + ($num & 0x3f));
103
104 } elsif ($num < 0xFFFF) {
105 $out .= chr (0xe0 + (($num >> 12) & 0xf));
106 $out .= chr (0x80 + (($num >> 6) & 0x3f));
107 $out .= chr (0x80 + ($num & 0x3f));
108
109 } else {
110 # error, don't encode anything
111 die;
112 }
113 }
114 return $out;
115}
116
117# utf82unicode takes a utf-8 string and produces a unicode
118# array
119sub utf82unicode {
120 my ($in) = @_;
121 my $out = [];
122
123 my $i = 0;
124 my ($c1, $c2, $c3);
125 my $len = length($in);
126 while ($i < $len) {
127 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
128 # normal ascii character
129 push (@$out, $c1);
130
131 } elsif ($c1 < 0xc0) {
132 # error, was expecting the first byte of an
133 # encoded character. Do nothing.
134
135 } elsif ($c1 < 0xe0 && $i+1 < $len) {
136 # an encoded character with two bytes
137 $c2 = ord (substr ($in, $i+1, 1));
138 if ($c2 >= 0x80 && $c2 < 0xc0) {
139 # everything looks ok
140 push (@$out, ((($c1 & 0x1f) << 6) +
141 ($c2 & 0x3f)));
142 $i++; # gobbled an extra byte
143 }
144
145 } elsif ($c1 < 0xf0 && $i+2 < $len) {
146 # an encoded character with three bytes
147 $c2 = ord (substr ($in, $i+1, 1));
148 $c3 = ord (substr ($in, $i+2, 1));
149 if ($c2 >= 0x80 && $c2 < 0xc0 &&
150 $c3 >= 0x80 && $c3 < 0xc0) {
151 # everything looks ok
152 push (@$out, ((($c1 & 0xf) << 12) +
153 (($c2 & 0x3f) << 6) +
154 ($c3 & 0x3f)));
155
156 $i += 2; # gobbled an extra two bytes
157 }
158
159 } else {
160 # error, only decode Unicode characters not full UCS.
161 # Do nothing.
162 }
163
164 $i++;
165 }
166
167 return $out;
168}
169
170# unicode2ucs2 takes a unicode array and produces a UCS-2
171# unicode string (every two bytes forms a unicode character)
172sub unicode2ucs2 {
173 my ($in) = @_;
174 my $out = "";
175
176 foreach my $num (@$in) {
177 $out .= chr (($num & 0xff00) >> 8);
178 $out .= chr ($num & 0xff);
179 }
180
181 return $out;
182}
183
184# ucs22unicode takes a UCS-2 string and produces a unicode array
185sub ucs22unicode {
186 my ($in) = @_;
187 my $out = [];
188
189 my $i = 0;
190 my $len = length ($in);
191 while ($i+1 < $len) {
192 push (@$out, ord (substr($in, $i, 1)) << 8 +
193 ord (substr($in, $i+1, 1)));
194
195 $i ++;
196 }
197
198 return $out;
199}
200
201# takes a reference to a string and returns a reference to a unicode array
202sub convert2unicode {
203 my ($encoding, $textref) = @_;
204
205 if (!defined $encodings::encodings->{$encoding}) {
206 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
207 return [];
208 }
209
210 my $encodename = "$encoding-unicode";
211 my $enc_info = $encodings::encodings->{$encoding};
212 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
213 "to_uc", $enc_info->{'mapfile'});
214 if (!&loadmapencoding ($encodename, $mapfile)) {
215 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
216 return [];
217 }
218
219 if (defined $enc_info->{'converter'}) {
220 my $converter = $enc_info->{'converter'};
221 return &$converter ($encodename, $textref);
222 }
223
224 if ($unicode::translations{$encodename}->{'count'} == 1) {
225 return &singlebyte2unicode ($encodename, $textref);
226 } else {
227 return &doublebyte2unicode ($encodename, $textref);
228 }
229}
230
231# singlebyte2unicode converts simple 8 bit encodings where characters below
232# 0x80 are normal ascii characters and the rest are decoded using the
233# appropriate mapping files.
234#
235# Examples of encodings that may be converted using singlebyte2unicode are
236# the iso-8859 and windows-125* series.
237sub singlebyte2unicode {
238 my ($encodename, $textref) = @_;
239
240 my @outtext = ();
241 my $len = length($$textref);
242 my ($c);
243 my $i = 0;
244
245 while ($i < $len) {
246 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
247 # normal ascii character
248 push (@outtext, $c);
249 } else {
250 $c = &transchar ($encodename, $c);
251 # put a black square if cannot translate
252 $c = 0x25A1 if $c == 0;
253 push (@outtext, $c);
254 }
255 $i ++;
256 }
257 return \@outtext;
258}
259
260# doublebyte2unicode converts simple two byte encodings where characters
261# below code point 0x80 are single-byte characters and the rest are
262# double-byte characters.
263#
264# Examples of encodings that may be converted using doublebyte2unicode are
265# CJK encodings like GB encoded Chinese and UHC Korean.
266#
267# Note that no error checking is performed to make sure that the input text
268# is valid for the given encoding.
269#
270# Also, encodings that may contain characters of more than two bytes are
271# not supported (any EUC encoded text may in theory contain 3-byte
272# characters but in practice only one and two byte characters are used).
273sub doublebyte2unicode {
274 my ($encodename, $textref) = @_;
275
276 my @outtext = ();
277 my $len = length($$textref);
278 my ($c1, $c2);
279 my $i = 0;
280
281 while ($i < $len) {
282 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
283 if ($i+1 < $len) {
284 # double-byte character
285 $c2 = ord(substr($$textref, $i+1, 1));
286 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
287 # put a black square if cannot translate
288 $c = 0x25A1 if $c == 0;
289 push (@outtext, $c);
290 $i += 2;
291
292 } else {
293 # error
294 print STDERR "unicode: ERROR missing second half of double-byte character\n";
295 $i++;
296 }
297
298 } else {
299 # single-byte character
300 push (@outtext, $c1);
301 $i++;
302 }
303 }
304 return \@outtext;
305}
306
307# Shift-JIS to unicode
308# We can't use doublebyte2unicode for Shift-JIS because it uses some
309# single-byte characters above code point 0x80 (i.e. half-width katakana
310# characters in the range 0xA1-0xDF)
311sub shiftjis2unicode {
312 my ($encodename, $textref) = @_;
313
314 my @outtext = ();
315 my $len = length($$textref);
316 my ($c1, $c2);
317 my $i = 0;
318
319 while ($i < $len) {
320 $c1 = ord(substr($$textref, $i, 1));
321
322 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
323 # Single-byte half-width katakana character or
324 # JIS Roman yen or overline characters
325 my $c = &transchar ($encodename, $c1);
326 # - put a black square if cannot translate
327 $c = 0x25A1 if $c == 0;
328 push (@outtext, $c);
329 $i++;
330
331 } elsif ($c1 < 0x80) {
332 # ASCII
333 push (@outtext, $c1);
334 $i ++;
335
336 } elsif ($c1 < 0xEF) {
337 if ($i+1 < $len) {
338 $c2 = ord(substr($$textref, $i+1, 1));
339 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
340 # Double-byte shift-jis character
341 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
342 # put a black square if cannot translate
343 $c = 0x25A1 if $c == 0;
344 push (@outtext, $c);
345 } else {
346 # error
347 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
348 }
349 $i += 2;
350 } else {
351 # error
352 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
353 $i ++;
354 }
355 } else {
356 # error
357 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
358 $i ++;
359 }
360 }
361 return \@outtext;
362}
363
364sub transchar {
365 my ($encoding, $from) = @_;
366 my $high = ($from / 256) % 256;
367 my $low = $from % 256;
368
369 return 0 unless defined $unicode::translations{$encoding};
370
371 my $block = $unicode::translations{$encoding}->{'map'};
372
373 if (ref ($block->[$high]) ne "ARRAY") {
374 return 0;
375 }
376 return $block->[$high]->[$low];
377}
378
379# %translations is of the form:
380#
381# encodings{encodingname-encodingname}->{'map'}->blocktranslation
382# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
383#
384# Any of the top translation blocks can point to an undefined
385# value. This data structure aims to allow fast translation and
386# efficient storage.
387%unicode::translations = ();
388
389# @array256 is used for initialisation, there must be
390# a better way...
391# What about this?: @array256 = (0) x 256;
392@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
393 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
404 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
405 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
406 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
407 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
408
409# returns 1 if successful, 0 if unsuccessful
410sub loadmapencoding {
411 my ($encoding, $mapfile) = @_;
412
413 # check to see if the encoding has already been loaded
414 return 1 if (defined $unicode::translations{$encoding});
415
416 if (! -r $mapfile || -d $mapfile) {
417 return 0;
418 }
419 return 0 unless open (MAPFILE, $mapfile);
420 binmode (MAPFILE);
421
422 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
423 my $block = $unicode::translations{$encoding};
424
425 my ($in,$i,$j);
426 while (1) {
427 my $ret=read(MAPFILE, $in, 1);
428 if (!defined($ret)) { # error
429 print STDERR "unicode.pm: error reading mapfile: $!\n";
430 last;
431 }
432 if ($ret != 1) { last }
433 $i = unpack ("C", $in);
434 $block->{'map'}->[$i] = [@unicode::array256];
435 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
436 my ($n1, $n2) = unpack ("CC", $in);
437 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
438 }
439 $block->{'count'} ++;
440 }
441
442 close (MAPFILE);
443}
444
445# unicode2singlebyte converts unicode to simple 8 bit encodings where
446# characters below 0x80 are normal ascii characters and the rest are encoded
447# using the appropriate mapping files.
448#
449# Examples of encodings that may be converted using unicode2singlebyte are
450# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
451sub unicode2singlebyte {
452 my ($uniref, $encoding) = @_;
453
454 my $outtext = "";
455 my $encodename = "unicode-$encoding";
456
457 if (!exists $encodings::encodings->{$encoding}) {
458 print STDERR "unicode.pm: ERROR - unsupported encoding "
459 . "'$encoding' requested\n";
460 return "";
461 }
462
463 my $enc_info = $encodings::encodings->{$encoding};
464 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
465 "from_uc", $enc_info->{'mapfile'});
466 if (!&loadmapencoding ($encodename, $mapfile)) {
467 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
468 return "";
469 }
470
471 foreach my $c (@$uniref) {
472 if ($c < 0x80) {
473 # normal ascii character
474 $outtext .= chr($c);
475 } else {
476 # extended ascii character
477 $c = &transchar ($encodename, $c);
478
479 # put a question mark if cannot translate
480 if ($c == 0) {
481 $outtext .= "?";
482 } else {
483 $outtext .= chr($c);
484 }
485 }
486 }
487 return $outtext;
488}
489
490
491# this makes sure that the referenced input string is utf8 encoded, and
492# will change/remove bytes that aren't.
493# returns 0 if the text was already utf8, or 1 if text modified to become utf8
494sub ensure_utf8 {
495 my $stringref=shift;
496
497 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
498 return $stringref;
499 }
500
501 my $value=$$stringref;
502
503 my $non_utf8_found = 0;
504 $value =~ m/^/g; # to set \G
505 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
506 my $highbytes=$1;
507 my $highbyteslength=length($highbytes);
508 # make sure this block of high bytes is utf-8
509 $highbytes =~ /^/g; # set pos()
510 my $byte_replaced = 0;
511 while ($highbytes =~
512 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
513 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
514 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
515 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
516 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
517 )*([\x80-\xff])? !xg
518 ) {
519 # this highbyte is "out-of-place" for valid utf-8
520 my $badbyte=$1;
521 if (!defined $badbyte) {next} # hit end of string
522 my $pos=pos($highbytes);
523 # replace bad byte. assume iso-8859-1 -> utf-8
524 # ascii2utf8 does "extended ascii"... ie iso-8859-1
525 my $replacement=&unicode::ascii2utf8(\$badbyte);
526 substr($highbytes, $pos-1, 1, $replacement);
527 # update the position to continue searching (for \G)
528 pos($highbytes) = $pos+length($replacement)-1;
529 $byte_replaced = 1;
530 }
531 if ($byte_replaced) {
532 # replace this block of high bytes in the $value
533 $non_utf8_found = 1;
534 my $replength=length($highbytes); # we've changed the length
535 my $textpos=pos($value); # pos at end of last match
536 # replace bad bytes with good bytes
537 substr($value, $textpos-$highbyteslength,
538 $highbyteslength, $highbytes);
539 # update the position to continue searching (for \G)
540 pos($value)=$textpos+($replength-$highbyteslength)+1;
541 }
542 }
543
544 $$stringref = $value;
545 return $non_utf8_found;
546}
547
548# Returns true (1) if the given string is utf8 and false (0) if it isn't.
549# Does not modify the string parameter.
550sub check_is_utf8 {
551 my $value=shift;
552
553 if (!defined($value)) {
554 return 0; # not utf8 because it is undefined
555 }
556
557 $value =~ m/^/g; # to set \G
558 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
559 my $highbytes=$1;
560 # make sure this block of high bytes is utf-8
561 $highbytes =~ /^/g; # set pos()
562 while ($highbytes =~
563 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
564 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
565 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
566 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
567 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
568 )*([\x80-\xff])? !xg
569 ) {
570 my $badbyte=$1;
571 if (defined $badbyte) { # not end of string
572 return 0; # non-utf8 found
573 }
574 }
575 }
576
577 return 1;
578}
579
580sub url_encode {
581 my ($text) = @_;
582
583 if (!&is_url_encoded($text)) {
584 $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
585 }
586 return $text;
587}
588
589sub url_decode {
590 my ($text) = @_;
591
592 $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
593 return $text;
594}
595
596sub is_url_encoded {
597 my ($text) = @_;
598 return ($text =~ m/\%([A-F0-9]{2})/);
599}
600
601# When a filename on the filesystem is already URL-encoded, the
602# URL to it will have %25s in place of every % sign, so that
603# URLs in html pages can refer to the URL-encoded filename.
604# This method changes the URL reference back into the actual
605# (URL-encoded) filename on the filesystem by replacing %25 with %.
606sub url_to_filename {
607 my ($text) =@_;
608 $text =~ s/%25/%/g if is_url_encoded($text);
609 return $text;
610}
611
612# When a filename on the filesystem is already URL-encoded, the
613# URL to it will have %25s in in place of every % sign, so that
614# URLs in html pages can refer to the URL-encoded filename.
615# Given a (URL-encoded) filename on the filesystem, this subroutine
616# returns the URL reference string for it by replacing % with %25.
617# The output string will be the same as the input string if the input
618# already contains one or more %25s. This is to prevent processing
619# a url more than once this way.
620sub filename_to_url {
621 my ($text, $rename_method) = @_;
622
623 if(!defined $rename_method || $rename_method eq "url") {
624 if($text !~ m/%25/) {
625 $text =~ s/%/%25/g;
626 }
627 }
628 return $text;
629}
630
631
632sub substr
633{
634 my ($utf8_string, $offset, $length) = @_;
635
636 my @unicode_string = @{&utf82unicode($utf8_string)};
637 my $unicode_string_length = scalar(@unicode_string);
638
639 my $substr_start = $offset;
640 if ($substr_start >= $unicode_string_length) {
641 return "";
642 }
643
644 my $substr_end = $offset + $length - 1;
645 if ($substr_end >= $unicode_string_length) {
646 $substr_end = $unicode_string_length - 1;
647 }
648
649 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
650 return &unicode2utf8(\@unicode_substring);
651}
652
653
6541;
Note: See TracBrowser for help on using the repository browser.