root/main/trunk/greenstone2/perllib/unicode.pm @ 34393

Revision 34393, 24.8 KB (checked in by ak19, 5 weeks ago)

Commit of independent code update before committing a bugfix. In this commit, the results of base64 encoding can have linebreaks, which aren't what we want or expect, so have a flag for removing them.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{   
47    my ($str) = @_;
48
49    return if (!defined $str);
50    return "" if ($str eq "");
51   
52    my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54    my @each_char
55    = map { ($_ > 255 )
56           ? # if wide character...
57             sprintf("\\x{%04X}", $_)
58           : # \x{...}
59             (chr($_) =~ m/[[:cntrl:]]/ )
60             ? # else if control character ...
61               sprintf("\\x%02X", $_)
62             : # \x..
63               quotemeta(chr($_)) # else quoted or as themselves
64           } @unpacked_chars;
65   
66    return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71    if (my $char = shift) {
72        return sprintf '%2.2x', unpack('U0U*', $char);
73    }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82    my ($in) = @_;
83    my $out = [];
84
85    my $i = 0;
86    my $len = length($in);
87    while ($i < $len) {
88    push (@$out, ord(substr ($in, $i, 1)));
89    $i++;
90    }
91
92    return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100    my ($in) = @_;
101    my $out = "";
102
103    if (!defined($in)|| !defined($$in)) {
104    return $out;
105    }
106
107    my ($c);
108    my $i = 0;
109    my $len = length($$in);
110    while ($i < $len) {
111    $c = ord (substr ($$in, $i, 1));
112    if ($c < 0x80) {
113        # ascii character
114        $out .= chr ($c);
115
116    } else {
117        # extended ascii character
118        $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119        $out .= chr (0x80 + ($c & 0x3f));
120    }
121    $i++;
122    }
123
124    return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130    my ($in) = @_;
131    my $out = "";
132   
133    foreach my $num (@$in) {
134    next unless defined $num;
135    if ($num < 0x80) {
136        $out .= chr ($num);
137
138    } elsif ($num < 0x800) {
139        $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140        $out .= chr (0x80 + ($num & 0x3f));
141
142    } elsif ($num < 0xFFFF) {
143        $out .= chr (0xe0 + (($num >> 12) & 0xf));
144        $out .= chr (0x80 + (($num >> 6) & 0x3f));
145        $out .= chr (0x80 + ($num & 0x3f));
146
147    } else {
148        # error, don't encode anything
149        #die;
150        # Diego's bugfix: instead of aborting the import process, it
151        # is better to get a converted file with a few extra spaces
152        print STDERR "strange char: $num\n";
153        $out .= " ";
154
155    }
156    }
157    return $out;
158}
159
160# utf82unicode takes a utf-8 string and produces a unicode
161# array
162sub utf82unicode {
163    my ($in) = @_;
164    my $out = [];
165
166    if(!defined $in) {
167    return $out;
168    }
169
170    my $i = 0;
171    my ($c1, $c2, $c3);
172    my $len = length($in);
173    while ($i < $len) {
174    if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175        # normal ascii character
176        push (@$out, $c1);
177
178    } elsif ($c1 < 0xc0) {
179        # error, was expecting the first byte of an
180        # encoded character. Do nothing.
181
182    } elsif ($c1 < 0xe0 && $i+1 < $len) {
183        # an encoded character with two bytes
184        $c2 = ord (substr ($in, $i+1, 1));
185        if ($c2 >= 0x80 && $c2 < 0xc0) {
186        # everything looks ok
187        push (@$out, ((($c1 & 0x1f) << 6) +
188              ($c2 & 0x3f)));
189        $i++; # gobbled an extra byte
190        }
191
192    } elsif ($c1 < 0xf0 && $i+2 < $len) {
193        # an encoded character with three bytes
194        $c2 = ord (substr ($in, $i+1, 1));
195        $c3 = ord (substr ($in, $i+2, 1));
196        if ($c2 >= 0x80 && $c2 < 0xc0 &&
197        $c3 >= 0x80 && $c3 < 0xc0) {
198        # everything looks ok
199        push (@$out, ((($c1 & 0xf) << 12) +
200              (($c2 & 0x3f) << 6) +
201              ($c3 & 0x3f)));
202
203        $i += 2; # gobbled an extra two bytes
204        }
205
206    } else {
207        # error, only decode Unicode characters not full UCS.
208        # Do nothing.
209    }
210
211    $i++;
212    }
213
214    return $out;
215}
216
217# unicode2ucs2 takes a unicode array and produces a UCS-2
218# unicode string (every two bytes forms a unicode character)
219sub unicode2ucs2 {
220    my ($in) = @_;
221    my $out = "";
222
223    foreach my $num (@$in) {
224    $out .= chr (($num & 0xff00) >> 8);
225    $out .= chr ($num & 0xff);
226    }
227
228    return $out;
229}
230
231# ucs22unicode takes a UCS-2 string and produces a unicode array
232sub ucs22unicode {
233    my ($in) = @_;
234    my $out = [];
235
236    my $i = 0;
237    my $len = length ($in);
238    while ($i+1 < $len) {
239    push (@$out, ord (substr($in, $i, 1)) << 8 +
240          ord (substr($in, $i+1, 1)));
241
242    $i ++;
243    }
244
245    return $out;
246}
247
248# takes a reference to a string and returns a reference to a unicode array
249sub convert2unicode {
250    my ($encoding, $textref) = @_;
251
252    if (!defined $encodings::encodings->{$encoding}) {
253    print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254    return [];
255    }
256
257    my $encodename = "$encoding-unicode";
258    my $enc_info = $encodings::encodings->{$encoding};
259    my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260                      "to_uc", $enc_info->{'mapfile'});
261    if (!&loadmapencoding ($encodename, $mapfile)) {
262    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263    return [];
264    }
265   
266    if (defined $enc_info->{'converter'}) {
267    my $converter = $enc_info->{'converter'};
268    return &$converter ($encodename, $textref);
269    }
270
271    if ($unicode::translations{$encodename}->{'count'} == 1) {
272    return &singlebyte2unicode ($encodename, $textref);
273    } else {
274    return &doublebyte2unicode ($encodename, $textref);
275    }
276}
277
278# singlebyte2unicode converts simple 8 bit encodings where characters below
279# 0x80 are normal ascii characters and the rest are decoded using the
280# appropriate mapping files.
281#
282# Examples of encodings that may be converted using singlebyte2unicode are
283# the iso-8859 and windows-125* series.
284sub singlebyte2unicode {
285    my ($encodename, $textref) = @_;
286
287    my @outtext = ();
288    my $len = length($$textref);
289    my ($c);
290    my $i = 0;
291
292    while ($i < $len) {
293    if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294        # normal ascii character
295        push (@outtext, $c);
296    } else {
297        $c = &transchar ($encodename, $c);
298        # put a black square if cannot translate
299        $c = 0x25A1 if $c == 0;
300        push (@outtext, $c);
301    }
302    $i ++;
303    }
304    return \@outtext;
305}
306
307# doublebyte2unicode converts simple two byte encodings where characters
308# below code point 0x80 are single-byte characters and the rest are
309# double-byte characters.
310#
311# Examples of encodings that may be converted using doublebyte2unicode are
312# CJK encodings like GB encoded Chinese and UHC Korean.
313#
314# Note that no error checking is performed to make sure that the input text
315# is valid for the given encoding.
316#
317# Also, encodings that may contain characters of more than two bytes are
318# not supported (any EUC encoded text may in theory contain 3-byte
319# characters but in practice only one and two byte characters are used).
320sub doublebyte2unicode {
321    my ($encodename, $textref) = @_;   
322   
323    my @outtext = ();
324    my $len = length($$textref);
325    my ($c1, $c2);
326    my $i = 0;
327
328    while ($i < $len) {
329    if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330        if ($i+1 < $len) {
331        # double-byte character
332        $c2 = ord(substr($$textref, $i+1, 1));
333        my $c = &transchar ($encodename, ($c1 << 8) | $c2);
334        # put a black square if cannot translate
335        $c = 0x25A1 if $c == 0;
336        push (@outtext, $c);
337        $i += 2;
338       
339        } else {
340        # error
341        print STDERR "unicode: ERROR missing second half of double-byte character\n";
342        $i++;
343        }
344       
345    } else {
346        # single-byte character
347        push (@outtext, $c1);
348        $i++;
349    }
350    }
351    return \@outtext;
352}
353
354# Shift-JIS to unicode
355# We can't use doublebyte2unicode for Shift-JIS because it uses some
356# single-byte characters above code point 0x80 (i.e. half-width katakana
357# characters in the range 0xA1-0xDF)
358sub shiftjis2unicode {
359    my ($encodename, $textref) = @_;
360   
361    my @outtext = ();
362    my $len = length($$textref);
363    my ($c1, $c2);
364    my $i = 0;
365
366    while ($i < $len) {
367    $c1 = ord(substr($$textref, $i, 1));
368
369    if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
370        # Single-byte half-width katakana character or
371        # JIS Roman yen or overline characters
372        my $c = &transchar ($encodename, $c1);
373        # - put a black square if cannot translate
374        $c = 0x25A1 if $c == 0;
375        push (@outtext, $c);
376        $i++;
377
378    } elsif ($c1 < 0x80) {
379        # ASCII
380        push (@outtext, $c1);
381        $i ++;
382
383    } elsif ($c1 < 0xEF) {
384        if ($i+1 < $len) {
385        $c2 = ord(substr($$textref, $i+1, 1));
386        if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
387            # Double-byte shift-jis character
388            my $c = &transchar ($encodename, ($c1 << 8) | $c2);
389            # put a black square if cannot translate
390            $c = 0x25A1 if $c == 0;
391            push (@outtext, $c);
392        } else {
393            # error
394            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395        }
396        $i += 2;
397        } else {
398        # error
399        print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400        $i ++;
401        }
402    } else {
403        # error
404        print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405        $i ++;
406    }
407    }
408    return \@outtext;
409}
410
411sub transchar {
412    my ($encoding, $from) = @_;
413    my $high = ($from / 256) % 256;
414    my $low = $from % 256;
415
416    return 0 unless defined $unicode::translations{$encoding};
417
418    my $block = $unicode::translations{$encoding}->{'map'};
419
420    if (ref ($block->[$high]) ne "ARRAY") {
421    return 0;
422    }
423    return $block->[$high]->[$low];
424}
425
426# %translations is of the form:
427#
428# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430#
431# Any of the top translation blocks can point to an undefined
432# value. This data structure aims to allow fast translation and
433# efficient storage.
434%unicode::translations = ();
435
436# @array256 is used for initialisation, there must be
437# a better way...
438# What about this?: @array256 = (0) x 256;
439@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456# returns 1 if successful, 0 if unsuccessful
457sub loadmapencoding {
458    my ($encoding, $mapfile) = @_;
459   
460    # check to see if the encoding has already been loaded
461    return 1 if (defined $unicode::translations{$encoding});
462
463    if (! -r $mapfile || -d $mapfile) {
464    return 0;
465    }
466    return 0 unless open (MAPFILE, $mapfile);
467    binmode (MAPFILE);
468
469    $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470    my $block = $unicode::translations{$encoding};
471
472    my ($in,$i,$j);
473    while (1) {
474    my $ret=read(MAPFILE, $in, 1);
475    if (!defined($ret)) { # error
476        print STDERR "unicode.pm: error reading mapfile: $!\n";
477        last;
478    }
479    if ($ret != 1) { last }
480    $i = unpack ("C", $in);
481    $block->{'map'}->[$i] = [@unicode::array256];
482    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483        my ($n1, $n2) = unpack ("CC", $in);
484        $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485    }
486    $block->{'count'} ++;
487    }
488
489    close (MAPFILE);
490}
491
492# unicode2singlebyte converts unicode to simple 8 bit encodings where
493# characters below 0x80 are normal ascii characters and the rest are encoded
494# using the appropriate mapping files.
495#
496# Examples of encodings that may be converted using unicode2singlebyte are
497# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498sub unicode2singlebyte {
499    my ($uniref, $encoding) = @_;
500
501    my $outtext = "";
502    my $encodename = "unicode-$encoding";
503
504    if (!exists $encodings::encodings->{$encoding}) {
505    print STDERR "unicode.pm: ERROR - unsupported encoding "
506        . "'$encoding' requested\n";
507    return "";
508    }
509
510    my $enc_info = $encodings::encodings->{$encoding};
511    my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512                      "from_uc", $enc_info->{'mapfile'});
513    if (!&loadmapencoding ($encodename, $mapfile)) {
514    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515    return "";
516    }
517   
518    foreach my $c (@$uniref) {
519    if ($c < 0x80) {
520        # normal ascii character
521        $outtext .= chr($c);
522    } else {
523        # extended ascii character
524        $c = &transchar ($encodename, $c);
525
526        # put a question mark if cannot translate
527        if ($c == 0) {
528        $outtext .= "?";
529        } else {
530        $outtext .= chr($c);
531        }
532    }
533    }
534    return $outtext;
535}
536
537
538# this makes sure that the referenced input string is utf8 encoded, and
539# will change/remove bytes that aren't.
540# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541sub ensure_utf8 {
542    my $stringref=shift;
543
544    if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
545    return $stringref;
546    }
547
548    my $value=$$stringref;
549
550    my $non_utf8_found = 0;
551    $value =~ m/^/g; # to set \G
552    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553    my $highbytes=$1;
554    my $highbyteslength=length($highbytes);
555    # make sure this block of high bytes is utf-8
556    $highbytes =~ /^/g; # set pos()
557    my $byte_replaced = 0;
558    while ($highbytes =~
559        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
560            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
561            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
562            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
563            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
564            )*([\x80-\xff])? !xg
565        ) {
566        # this highbyte is "out-of-place" for valid utf-8
567        my $badbyte=$1;
568        if (!defined $badbyte) {next} # hit end of string
569        my $pos=pos($highbytes);
570        # replace bad byte. assume iso-8859-1 -> utf-8
571        # ascii2utf8 does "extended ascii"... ie iso-8859-1
572        my $replacement=&unicode::ascii2utf8(\$badbyte);
573        substr($highbytes, $pos-1, 1, $replacement);
574        # update the position to continue searching (for \G)
575        pos($highbytes) = $pos+length($replacement)-1;
576        $byte_replaced = 1;
577    }
578    if ($byte_replaced) {
579        # replace this block of high bytes in the $value
580        $non_utf8_found = 1;
581        my $replength=length($highbytes); # we've changed the length
582        my $textpos=pos($value); # pos at end of last match
583        # replace bad bytes with good bytes
584        substr($value, $textpos-$highbyteslength,
585                $highbyteslength, $highbytes);
586        # update the position to continue searching (for \G)
587        pos($value)=$textpos+($replength-$highbyteslength)+1;
588    }
589    }
590
591    $$stringref = $value;
592    return $non_utf8_found;
593}
594
595# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596# Does not modify the string parameter.
597sub check_is_utf8 {
598    my $value=shift;
599
600    if (!defined($value)) {
601    return 0; # not utf8 because it is undefined
602    }
603
604    $value =~ m/^/g; # to set \G
605    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606    my $highbytes=$1;
607    # make sure this block of high bytes is utf-8
608    $highbytes =~ /^/g; # set pos()
609    while ($highbytes =~
610        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
611            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
612            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
613            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
614            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
615            )*([\x80-\xff])? !xg
616        ) {
617        my $badbyte=$1;
618        if (defined $badbyte) { # not end of string
619        return 0; # non-utf8 found
620        }
621    }
622    }
623   
624    return 1;
625}
626
627sub url_encode {
628    my ($text) = @_;
629   
630    if (!&is_url_encoded($text)) {
631    $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632    # return the url-encoded character entity for underscore back to the entity
633    $text =~ s/%26%23095%3B/&\#095;/g;
634    }
635    return $text;
636}
637
638sub url_decode {
639    my ($text,$and_numeric_entities) = @_;
640
641    if(defined $text) {
642    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643   
644    if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645        $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646        $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647    }
648    }
649
650    return $text;
651}
652
653sub url_decode_utf8 {
654    my ($text,$and_numeric_entities) = @_;
655
656    $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658    $text = Encode::decode("utf8",$text);
659
660    return $text;
661}
662
663sub is_url_encoded {
664    my ($text) = @_;
665    return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
666}
667
668# When a filename on the filesystem is already URL-encoded, the
669# URL to it will have %25s in place of every % sign, so that
670# URLs in html pages can refer to the URL-encoded filename.
671# This method changes the URL reference back into the actual
672# (URL-encoded) filename on the filesystem by replacing %25 with %.
673sub url_to_filename {
674    my ($text) =@_;
675    $text =~ s/%25/%/g if &is_url_encoded($text);
676    # DM safing would have replaced underscores with character entity &#095;
677    # in SourceFile meta. Undo any such change to get the filename referred to.
678    $text =~ s/&\#095;/_/g;
679    return $text;
680}
681
682# When a filename on the filesystem is already URL-encoded, the
683# URL to it will have %25s in place of every % sign, so that
684# URLs in html pages can refer to the URL-encoded filename.
685# Given a (URL-encoded) filename on the filesystem, this subroutine
686# returns the URL reference string for it by replacing % with %25.
687# The output string will be the same as the input string if the input
688# already contains one or more %25s. This is to prevent processing
689# a url more than once this way.
690sub filename_to_url {
691    my ($text) = @_;
692   
693    if($text !~ m/%25/) {
694    $text =~ s/%/%25/g;
695    }
696    return $text;
697}
698
699# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701# string is passed in, including any plain ASCII string.
702# That means this method could double encode an already encoded string.
703# However, this method is necessary because on the GLI end, we can't detect whether a plain
704# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706# (except the "import" prefix, which is special) or none of the parts.
707sub force_base64_encode {
708    my ($text) = @_;
709    my $force_encode = 1;
710    my $no_newlines = 1;
711    return &base64_encode($text, $force_encode, $no_newlines);
712}
713
714sub base64_encode {
715    my ($text, $force_encode, $no_newlines) = @_;
716    if($force_encode || !&conforms_to_mod_base64($text)) {
717    # return entity for underscore to underscore before encoding
718    $text =~ s/&\#095;/_/g;
719
720    $text = &MIME::Base64::encode_base64($text);
721    # base64 encoding may introduce + and / signs,
722    # replacing them with - and _ to ensure it's filename-safe
723    $text =~ s/\+/\-/g; # + -> -
724    $text =~ s/\//\_/g; # / -> _
725   
726    # by default base64 encoding a long string introduces newlines to break long strings,
727    # see https://stackoverflow.com/questions/19952621/is-it-ok-to-remove-newline-in-base64-encoding
728    # Java doesn't like that
729    $text =~ s/\n//gs if $no_newlines;
730    }
731    return $text;
732}
733
734# If the input fits the modified base64 pattern, this will try decoding it.
735# Still, this method does not guarantee the return value is the 'original', only
736# that the result is where the base64 decoding process has been applied once.
737# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
738sub base64_decode {
739    my ($text) = @_;
740    if(&conforms_to_mod_base64($text)) {
741    # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
742    # have replaced them with - and _ respectively. Undo this before decoding.
743    $text =~ s/\-/\+/g;      # - -> +
744    $text =~ s/\_/\//g;      # _ -> /
745    $text = &MIME::Base64::decode_base64($text);
746    }
747    return $text;
748}
749
750# Returns true if the given string is compatible with a modified version
751# of base64 (where the + and / are replaced with - and _), a format which
752# includes also regular ASCII alphanumeric values. This method does not
753# guarantee that the given string is actually base64 encoded, since it will
754# return true for any simple alphanumeric ASCII string as well.
755sub conforms_to_mod_base64 {
756    my ($text) = @_;
757
758    # need to treat the entity ref for underscore as underscore
759    $text =~ s/&\#095;/_/g;
760
761    # base 64 takes alphanumeric and [=+/],
762    # but we use modified base64 where + and / are replaced with  - and _
763    return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
764}
765
766sub substr
767{
768    my ($utf8_string, $offset, $length) = @_;
769
770    my @unicode_string = @{&utf82unicode($utf8_string)};
771    my $unicode_string_length = scalar(@unicode_string);
772
773    my $substr_start = $offset;
774    if ($substr_start >= $unicode_string_length) {
775    return "";
776    }
777
778    my $substr_end = $offset + $length - 1;
779    if ($substr_end >= $unicode_string_length) {
780    $substr_end = $unicode_string_length - 1;
781    }
782
783    my @unicode_substring = @unicode_string[$substr_start..$substr_end];
784    return &unicode2utf8(\@unicode_substring);
785}
786
787# perl version of stringToHex
788# Useful method to print UTF8 (or other unicode) for debugging.
789# Characters that are easily displayed (that is, printable ASCII)
790# are shown as-is, whereas hex values of the unicode code points
791# are shown for all other chars.
792sub debug_unicode_string
793{
794    join("",
795         map { $_ > 127 ?                      # if wide character...
796                   sprintf("\\x{%04X}", $_) :  # \x{...}
797                   chr($_)         
798               } unpack("U*", $_[0]));         # unpack Unicode characters
799}
800
801sub raw_filename_to_url_encoded
802{
803    my ($str_in) = @_;
804
805    my @url_encoded_chars
806    = map { $_ > 255 ?                  # Needs to be represent in entity form
807            sprintf("&#x%X;",$_) : 
808            $_>127 || $_==ord("%") ?              # Representable in %XX form
809            sprintf("%%%2X", $_) : 
810            chr($_)                 # otherwise, Ascii char
811        } unpack("U*", $str_in); # Unpack Unicode characters
812
813   
814    my $str_out = join("", @url_encoded_chars);
815
816    return $str_out;
817
818}
819
820sub url_encoded_to_raw_filename
821{
822    my ($str_in) = @_;
823
824    my $str_out = $str_in;
825
826    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
827    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
828    $str_out =~ s/&#([0-9]+);/chr($1)/eig;
829
830    return $str_out;
831}
832
833
834sub raw_filename_to_utf8_url_encoded
835{
836    my ($str_in) = @_;
837
838    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
839
840    my @url_encoded_chars
841    = map { $_ > 127 ?                  # Representable in %XX form
842            sprintf("%%%2X", $_) : 
843            chr($_)                 # otherwise, Ascii char
844        } unpack("U*", $str_in); # Unpack utf8 characters
845
846   
847    my $str_out = join("", @url_encoded_chars);
848
849    return $str_out;
850
851}
852
853sub utf8_url_encoded_to_raw_filename
854{
855    my ($str_in) = @_;
856
857    my $utf8_str_out = $str_in;
858
859    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
860
861    my $unicode_str_out = decode("utf8",$utf8_str_out);
862    my $raw_str_out = utf8::downgrade($unicode_str_out);
863   
864    return $raw_str_out;
865}
866
867sub analyze_raw_string
868{
869    my ($str_in) = @_;
870
871    my $uses_bytecodes = 0;
872    my $exceeds_bytecodes = 0;
873
874    map { $exceeds_bytecodes = 1 if ($_ >= 256);
875      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256));
876    } unpack("U*", $str_in); # Unpack Unicode characters
877
878    return ($uses_bytecodes,$exceeds_bytecodes);
879}
880
881
882sub convert_utf8_string_to_unicode_string
883{
884    my $utf8_string = shift(@_);
885
886    my $unicode_string = "";
887    foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
888    $unicode_string .= chr($unicode_value);
889    }
890    return $unicode_string;
891}
892
893sub convert_unicode_string_to_utf8_string
894{
895    my $unicode_string = shift(@_);
896
897    my @unicode_array;
898    for (my $i = 0; $i < length($unicode_string); $i++) {
899    push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
900    }
901    return &unicode::unicode2utf8(\@unicode_array);
902}
903
904
9051;
Note: See TracBrowser for help on using the browser.