root/gsdl/trunk/perllib/unicode.pm @ 18379

Revision 18379, 19.1 KB (checked in by ak19, 11 years ago)

Base64 encoding subroutine now replaces the plus and forward slash symbols that it may generate with the minus and underscore chars, respectively, in order to ensure that it doesn't generate unsafe filenames. Base64 decoding subroutine has been correspondingly modified.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use MIME::Base64; # for base64 encoding
39
40no strict 'refs';
41
42# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
43# and returns a unicode array.
44sub ascii2unicode {
45    my ($in) = @_;
46    my $out = [];
47
48    my $i = 0;
49    my $len = length($in);
50    while ($i < $len) {
51    push (@$out, ord(substr ($in, $i, 1)));
52    $i++;
53    }
54
55    return $out;
56}
57
58# ascii2utf8 takes a reference to an (extended) ascii string and returns a
59# UTF-8 encoded string. This is just a faster version of
60# "&unicode2utf8(&ascii2unicode($str));"
61# "Extended ascii" really means "iso_8859_1"
62sub ascii2utf8 {
63    my ($in) = @_;
64    my $out = "";
65
66    if (!defined($in)|| !defined($$in)) {
67    return $out;
68    }
69
70    my ($c);
71    my $i = 0;
72    my $len = length($$in);
73    while ($i < $len) {
74    $c = ord (substr ($$in, $i, 1));
75    if ($c < 0x80) {
76        # ascii character
77        $out .= chr ($c);
78
79    } else {
80        # extended ascii character
81        $out .= chr (0xc0 + (($c >> 6) & 0x1f));
82        $out .= chr (0x80 + ($c & 0x3f));
83    }
84    $i++;
85    }
86
87    return $out;
88}
89
90# unicode2utf8 takes a unicode array as input and encodes it
91# using utf-8
92sub unicode2utf8 {
93    my ($in) = @_;
94    my $out = "";
95   
96    foreach my $num (@$in) {
97    next unless defined $num;
98    if ($num < 0x80) {
99        $out .= chr ($num);
100
101    } elsif ($num < 0x800) {
102        $out .= chr (0xc0 + (($num >> 6) & 0x1f));
103        $out .= chr (0x80 + ($num & 0x3f));
104
105    } elsif ($num < 0xFFFF) {
106        $out .= chr (0xe0 + (($num >> 12) & 0xf));
107        $out .= chr (0x80 + (($num >> 6) & 0x3f));
108        $out .= chr (0x80 + ($num & 0x3f));
109
110    } else {
111        # error, don't encode anything
112        die;
113    }
114    }
115    return $out;
116}
117
118# utf82unicode takes a utf-8 string and produces a unicode
119# array
120sub utf82unicode {
121    my ($in) = @_;
122    my $out = [];
123
124    my $i = 0;
125    my ($c1, $c2, $c3);
126    my $len = length($in);
127    while ($i < $len) {
128    if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
129        # normal ascii character
130        push (@$out, $c1);
131
132    } elsif ($c1 < 0xc0) {
133        # error, was expecting the first byte of an
134        # encoded character. Do nothing.
135
136    } elsif ($c1 < 0xe0 && $i+1 < $len) {
137        # an encoded character with two bytes
138        $c2 = ord (substr ($in, $i+1, 1));
139        if ($c2 >= 0x80 && $c2 < 0xc0) {
140        # everything looks ok
141        push (@$out, ((($c1 & 0x1f) << 6) +
142              ($c2 & 0x3f)));
143        $i++; # gobbled an extra byte
144        }
145
146    } elsif ($c1 < 0xf0 && $i+2 < $len) {
147        # an encoded character with three bytes
148        $c2 = ord (substr ($in, $i+1, 1));
149        $c3 = ord (substr ($in, $i+2, 1));
150        if ($c2 >= 0x80 && $c2 < 0xc0 &&
151        $c3 >= 0x80 && $c3 < 0xc0) {
152        # everything looks ok
153        push (@$out, ((($c1 & 0xf) << 12) +
154              (($c2 & 0x3f) << 6) +
155              ($c3 & 0x3f)));
156
157        $i += 2; # gobbled an extra two bytes
158        }
159
160    } else {
161        # error, only decode Unicode characters not full UCS.
162        # Do nothing.
163    }
164
165    $i++;
166    }
167
168    return $out;
169}
170
171# unicode2ucs2 takes a unicode array and produces a UCS-2
172# unicode string (every two bytes forms a unicode character)
173sub unicode2ucs2 {
174    my ($in) = @_;
175    my $out = "";
176
177    foreach my $num (@$in) {
178    $out .= chr (($num & 0xff00) >> 8);
179    $out .= chr ($num & 0xff);
180    }
181
182    return $out;
183}
184
185# ucs22unicode takes a UCS-2 string and produces a unicode array
186sub ucs22unicode {
187    my ($in) = @_;
188    my $out = [];
189
190    my $i = 0;
191    my $len = length ($in);
192    while ($i+1 < $len) {
193    push (@$out, ord (substr($in, $i, 1)) << 8 +
194          ord (substr($in, $i+1, 1)));
195
196    $i ++;
197    }
198
199    return $out;
200}
201
202# takes a reference to a string and returns a reference to a unicode array
203sub convert2unicode {
204    my ($encoding, $textref) = @_;
205
206    if (!defined $encodings::encodings->{$encoding}) {
207    print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
208    return [];
209    }
210
211    my $encodename = "$encoding-unicode";
212    my $enc_info = $encodings::encodings->{$encoding};
213    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
214                      "to_uc", $enc_info->{'mapfile'});
215    if (!&loadmapencoding ($encodename, $mapfile)) {
216    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
217    return [];
218    }
219   
220    if (defined $enc_info->{'converter'}) {
221    my $converter = $enc_info->{'converter'};
222    return &$converter ($encodename, $textref);
223    }
224
225    if ($unicode::translations{$encodename}->{'count'} == 1) {
226    return &singlebyte2unicode ($encodename, $textref);
227    } else {
228    return &doublebyte2unicode ($encodename, $textref);
229    }
230}
231
232# singlebyte2unicode converts simple 8 bit encodings where characters below
233# 0x80 are normal ascii characters and the rest are decoded using the
234# appropriate mapping files.
235#
236# Examples of encodings that may be converted using singlebyte2unicode are
237# the iso-8859 and windows-125* series.
238sub singlebyte2unicode {
239    my ($encodename, $textref) = @_;
240
241    my @outtext = ();
242    my $len = length($$textref);
243    my ($c);
244    my $i = 0;
245
246    while ($i < $len) {
247    if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
248        # normal ascii character
249        push (@outtext, $c);
250    } else {
251        $c = &transchar ($encodename, $c);
252        # put a black square if cannot translate
253        $c = 0x25A1 if $c == 0;
254        push (@outtext, $c);
255    }
256    $i ++;
257    }
258    return \@outtext;
259}
260
261# doublebyte2unicode converts simple two byte encodings where characters
262# below code point 0x80 are single-byte characters and the rest are
263# double-byte characters.
264#
265# Examples of encodings that may be converted using doublebyte2unicode are
266# CJK encodings like GB encoded Chinese and UHC Korean.
267#
268# Note that no error checking is performed to make sure that the input text
269# is valid for the given encoding.
270#
271# Also, encodings that may contain characters of more than two bytes are
272# not supported (any EUC encoded text may in theory contain 3-byte
273# characters but in practice only one and two byte characters are used).
274sub doublebyte2unicode {
275    my ($encodename, $textref) = @_;   
276   
277    my @outtext = ();
278    my $len = length($$textref);
279    my ($c1, $c2);
280    my $i = 0;
281
282    while ($i < $len) {
283    if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
284        if ($i+1 < $len) {
285        # double-byte character
286        $c2 = ord(substr($$textref, $i+1, 1));
287        my $c = &transchar ($encodename, ($c1 << 8) | $c2);
288        # put a black square if cannot translate
289        $c = 0x25A1 if $c == 0;
290        push (@outtext, $c);
291        $i += 2;
292       
293        } else {
294        # error
295        print STDERR "unicode: ERROR missing second half of double-byte character\n";
296        $i++;
297        }
298       
299    } else {
300        # single-byte character
301        push (@outtext, $c1);
302        $i++;
303    }
304    }
305    return \@outtext;
306}
307
308# Shift-JIS to unicode
309# We can't use doublebyte2unicode for Shift-JIS because it uses some
310# single-byte characters above code point 0x80 (i.e. half-width katakana
311# characters in the range 0xA1-0xDF)
312sub shiftjis2unicode {
313    my ($encodename, $textref) = @_;
314   
315    my @outtext = ();
316    my $len = length($$textref);
317    my ($c1, $c2);
318    my $i = 0;
319
320    while ($i < $len) {
321    $c1 = ord(substr($$textref, $i, 1));
322
323    if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
324        # Single-byte half-width katakana character or
325        # JIS Roman yen or overline characters
326        my $c = &transchar ($encodename, $c1);
327        # - put a black square if cannot translate
328        $c = 0x25A1 if $c == 0;
329        push (@outtext, $c);
330        $i++;
331
332    } elsif ($c1 < 0x80) {
333        # ASCII
334        push (@outtext, $c1);
335        $i ++;
336
337    } elsif ($c1 < 0xEF) {
338        if ($i+1 < $len) {
339        $c2 = ord(substr($$textref, $i+1, 1));
340        if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
341            # Double-byte shift-jis character
342            my $c = &transchar ($encodename, ($c1 << 8) | $c2);
343            # put a black square if cannot translate
344            $c = 0x25A1 if $c == 0;
345            push (@outtext, $c);
346        } else {
347            # error
348            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
349        }
350        $i += 2;
351        } else {
352        # error
353        print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
354        $i ++;
355        }
356    } else {
357        # error
358        print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
359        $i ++;
360    }
361    }
362    return \@outtext;
363}
364
365sub transchar {
366    my ($encoding, $from) = @_;
367    my $high = ($from / 256) % 256;
368    my $low = $from % 256;
369
370    return 0 unless defined $unicode::translations{$encoding};
371
372    my $block = $unicode::translations{$encoding}->{'map'};
373
374    if (ref ($block->[$high]) ne "ARRAY") {
375    return 0;
376    }
377    return $block->[$high]->[$low];
378}
379
380# %translations is of the form:
381#
382# encodings{encodingname-encodingname}->{'map'}->blocktranslation
383# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
384#
385# Any of the top translation blocks can point to an undefined
386# value. This data structure aims to allow fast translation and
387# efficient storage.
388%unicode::translations = ();
389
390# @array256 is used for initialisation, there must be
391# a better way...
392# What about this?: @array256 = (0) x 256;
393@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
394         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
395         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
396         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
397         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
398         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
399         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
400         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
401         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
402         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
403         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
404         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
405         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
406         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
407         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
408         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
409
410# returns 1 if successful, 0 if unsuccessful
411sub loadmapencoding {
412    my ($encoding, $mapfile) = @_;
413   
414    # check to see if the encoding has already been loaded
415    return 1 if (defined $unicode::translations{$encoding});
416
417    if (! -r $mapfile || -d $mapfile) {
418    return 0;
419    }
420    return 0 unless open (MAPFILE, $mapfile);
421    binmode (MAPFILE);
422
423    $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
424    my $block = $unicode::translations{$encoding};
425
426    my ($in,$i,$j);
427    while (1) {
428    my $ret=read(MAPFILE, $in, 1);
429    if (!defined($ret)) { # error
430        print STDERR "unicode.pm: error reading mapfile: $!\n";
431        last;
432    }
433    if ($ret != 1) { last }
434    $i = unpack ("C", $in);
435    $block->{'map'}->[$i] = [@unicode::array256];
436    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
437        my ($n1, $n2) = unpack ("CC", $in);
438        $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
439    }
440    $block->{'count'} ++;
441    }
442
443    close (MAPFILE);
444}
445
446# unicode2singlebyte converts unicode to simple 8 bit encodings where
447# characters below 0x80 are normal ascii characters and the rest are encoded
448# using the appropriate mapping files.
449#
450# Examples of encodings that may be converted using unicode2singlebyte are
451# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
452sub unicode2singlebyte {
453    my ($uniref, $encoding) = @_;
454
455    my $outtext = "";
456    my $encodename = "unicode-$encoding";
457
458    if (!exists $encodings::encodings->{$encoding}) {
459    print STDERR "unicode.pm: ERROR - unsupported encoding "
460        . "'$encoding' requested\n";
461    return "";
462    }
463
464    my $enc_info = $encodings::encodings->{$encoding};
465    my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
466                      "from_uc", $enc_info->{'mapfile'});
467    if (!&loadmapencoding ($encodename, $mapfile)) {
468    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
469    return "";
470    }
471   
472    foreach my $c (@$uniref) {
473    if ($c < 0x80) {
474        # normal ascii character
475        $outtext .= chr($c);
476    } else {
477        # extended ascii character
478        $c = &transchar ($encodename, $c);
479
480        # put a question mark if cannot translate
481        if ($c == 0) {
482        $outtext .= "?";
483        } else {
484        $outtext .= chr($c);
485        }
486    }
487    }
488    return $outtext;
489}
490
491
492# this makes sure that the referenced input string is utf8 encoded, and
493# will change/remove bytes that aren't.
494# returns 0 if the text was already utf8, or 1 if text modified to become utf8
495sub ensure_utf8 {
496    my $stringref=shift;
497
498    if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
499    return $stringref;
500    }
501
502    my $value=$$stringref;
503
504    my $non_utf8_found = 0;
505    $value =~ m/^/g; # to set \G
506    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
507    my $highbytes=$1;
508    my $highbyteslength=length($highbytes);
509    # make sure this block of high bytes is utf-8
510    $highbytes =~ /^/g; # set pos()
511    my $byte_replaced = 0;
512    while ($highbytes =~
513        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
514            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
515            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
516            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
517            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
518            )*([\x80-\xff])? !xg
519        ) {
520        # this highbyte is "out-of-place" for valid utf-8
521        my $badbyte=$1;
522        if (!defined $badbyte) {next} # hit end of string
523        my $pos=pos($highbytes);
524        # replace bad byte. assume iso-8859-1 -> utf-8
525        # ascii2utf8 does "extended ascii"... ie iso-8859-1
526        my $replacement=&unicode::ascii2utf8(\$badbyte);
527        substr($highbytes, $pos-1, 1, $replacement);
528        # update the position to continue searching (for \G)
529        pos($highbytes) = $pos+length($replacement)-1;
530        $byte_replaced = 1;
531    }
532    if ($byte_replaced) {
533        # replace this block of high bytes in the $value
534        $non_utf8_found = 1;
535        my $replength=length($highbytes); # we've changed the length
536        my $textpos=pos($value); # pos at end of last match
537        # replace bad bytes with good bytes
538        substr($value, $textpos-$highbyteslength,
539                $highbyteslength, $highbytes);
540        # update the position to continue searching (for \G)
541        pos($value)=$textpos+($replength-$highbyteslength)+1;
542    }
543    }
544
545    $$stringref = $value;
546    return $non_utf8_found;
547}
548
549# Returns true (1) if the given string is utf8 and false (0) if it isn't.
550# Does not modify the string parameter.
551sub check_is_utf8 {
552    my $value=shift;
553
554    if (!defined($value)) {
555    return 0; # not utf8 because it is undefined
556    }
557
558    $value =~ m/^/g; # to set \G
559    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
560    my $highbytes=$1;
561    # make sure this block of high bytes is utf-8
562    $highbytes =~ /^/g; # set pos()
563    while ($highbytes =~
564        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
565            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
566            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
567            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
568            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
569            )*([\x80-\xff])? !xg
570        ) {
571        my $badbyte=$1;
572        if (defined $badbyte) { # not end of string
573        return 0; # non-utf8 found
574        }
575    }
576    }
577   
578    return 1;
579}
580
581sub url_encode {
582    my ($text) = @_;
583   
584    if (!&is_url_encoded($text)) {
585    $text =~ s/([^A-Z0-9\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
586    }
587    return $text;
588}
589
590sub url_decode {
591    my ($text) = @_;
592
593    $text =~ s/\%([A-F0-9]{2})/pack('C', hex($1))/ige;
594    return $text;
595}
596
597sub is_url_encoded {
598    my ($text) = @_;
599    return ($text =~ m/\%([A-F0-9]{2})/);
600}
601
602# When a filename on the filesystem is already URL-encoded, the
603# URL to it will have %25s in place of every % sign, so that
604# URLs in html pages can refer to the URL-encoded filename.
605# This method changes the URL reference back into the actual
606# (URL-encoded) filename on the filesystem by replacing %25 with %.
607sub url_to_filename {
608    my ($text) =@_;
609    $text =~ s/%25/%/g if &is_url_encoded($text);
610    return $text;
611}
612
613# When a filename on the filesystem is already URL-encoded, the
614# URL to it will have %25s in in place of every % sign, so that
615# URLs in html pages can refer to the URL-encoded filename.
616# Given a (URL-encoded) filename on the filesystem, this subroutine
617# returns the URL reference string for it by replacing % with %25.
618# The output string will be the same as the input string if the input
619# already contains one or more %25s. This is to prevent processing
620# a url more than once this way.
621sub filename_to_url {
622    my ($text, $rename_method) = @_;
623   
624    if(!defined $rename_method || $rename_method eq "url") {
625    if($text !~ m/%25/) {
626        $text =~ s/%/%25/g;
627    }
628    }
629    return $text;
630}
631
632sub base64_encode {
633    my ($text) = @_;
634    if(!&conforms_to_mod_base64($text)) {
635    $text = &MIME::Base64::encode_base64($text);
636    # base64 encoding may introduce + and / signs,
637    # replacing them with - and _ to ensure it's filename-safe
638    $text =~ s/\+/\-/g; # + -> -
639    $text =~ s/\//\_/g; # / -> _
640    }
641    return $text;
642}
643
644# If the input fits the modified base64 pattern, this will try decoding it.
645# Still, this method does not guarantee the return value is the 'original', only
646# that the result is where the base64 decoding process has been applied once.
647# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
648sub base64_decode {
649    my ($text) = @_;
650    if(&conforms_to_mod_base64($text)) {
651    # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
652    # have replaced them with - and _ respectively. Undo this before decoding.
653    $text =~ s/\-/\+/g;      # - -> +
654    $text =~ s/\_/\//g;      # _ -> /
655    $text = &MIME::Base64::decode_base64($text);
656    }
657    return $text;
658}
659
660# Returns true if the given string is compatible with a modified version
661# of base64 (where the + and / are replaced with - and _), a format which
662# includes also regular ASCII alphanumeric values. This method does not
663# guarantee that the given string is actually base64 encoded, since it will
664# return true for any simple alphanumeric ASCII string as well.
665sub conforms_to_mod_base64 {
666    my ($text) = @_;
667    # base 64 takes alphanumeric and [=+/],
668    # but we use modified base64 where + and / are replaced with  - and _
669    return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
670}
671
672sub substr
673{
674    my ($utf8_string, $offset, $length) = @_;
675
676    my @unicode_string = @{&utf82unicode($utf8_string)};
677    my $unicode_string_length = scalar(@unicode_string);
678
679    my $substr_start = $offset;
680    if ($substr_start >= $unicode_string_length) {
681    return "";
682    }
683
684    my $substr_end = $offset + $length - 1;
685    if ($substr_end >= $unicode_string_length) {
686    $substr_end = $unicode_string_length - 1;
687    }
688
689    my @unicode_substring = @unicode_string[$substr_start..$substr_end];
690    return &unicode2utf8(\@unicode_substring);
691}
692
693
6941;
Note: See TracBrowser for help on using the browser.