root/main/trunk/greenstone2/perllib/unicode.pm @ 33757

Revision 33757, 24.5 KB (checked in by ak19, 11 months ago)

1. Windows bugfix for getting exMeta to be loaded into GLI where there are subdirs involved in the Gather pane, or there are non-ASCII filenames, or the file rename method is set to base64. 2. Bugfix for Linux and Windows: Using Base64 to rename files was still a problem despite the previous commit (which was supposed to have fixed all GLI exMeta loading issues on Linux) in the special case where a subfolder was pure ASCII. The perl code wouldn't base64 encode such subdirs. However, GLI won't know which part of a relative file path to decode based on the file rename method used and which parts are not to be decoded. So GLI uniformly decoded them, and ASCII named subfolders that were not base64 encoded (but contained files that were to be renamed with base64) got base64 decoded into garbage, so that exMeta still did not get attached. 3. This commit contains debug stmts.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{   
47    my ($str) = @_;
48
49    return if (!defined $str);
50    return "" if ($str eq "");
51   
52    my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54    my @each_char
55    = map { ($_ > 255 )
56           ? # if wide character...
57             sprintf("\\x{%04X}", $_)
58           : # \x{...}
59             (chr($_) =~ m/[[:cntrl:]]/ )
60             ? # else if control character ...
61               sprintf("\\x%02X", $_)
62             : # \x..
63               quotemeta(chr($_)) # else quoted or as themselves
64           } @unpacked_chars;
65   
66    return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71    if (my $char = shift) {
72        return sprintf '%2.2x', unpack('U0U*', $char);
73    }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82    my ($in) = @_;
83    my $out = [];
84
85    my $i = 0;
86    my $len = length($in);
87    while ($i < $len) {
88    push (@$out, ord(substr ($in, $i, 1)));
89    $i++;
90    }
91
92    return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100    my ($in) = @_;
101    my $out = "";
102
103    if (!defined($in)|| !defined($$in)) {
104    return $out;
105    }
106
107    my ($c);
108    my $i = 0;
109    my $len = length($$in);
110    while ($i < $len) {
111    $c = ord (substr ($$in, $i, 1));
112    if ($c < 0x80) {
113        # ascii character
114        $out .= chr ($c);
115
116    } else {
117        # extended ascii character
118        $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119        $out .= chr (0x80 + ($c & 0x3f));
120    }
121    $i++;
122    }
123
124    return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130    my ($in) = @_;
131    my $out = "";
132   
133    foreach my $num (@$in) {
134    next unless defined $num;
135    if ($num < 0x80) {
136        $out .= chr ($num);
137
138    } elsif ($num < 0x800) {
139        $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140        $out .= chr (0x80 + ($num & 0x3f));
141
142    } elsif ($num < 0xFFFF) {
143        $out .= chr (0xe0 + (($num >> 12) & 0xf));
144        $out .= chr (0x80 + (($num >> 6) & 0x3f));
145        $out .= chr (0x80 + ($num & 0x3f));
146
147    } else {
148        # error, don't encode anything
149        #die;
150        # Diego's bugfix: instead of aborting the import process, it
151        # is better to get a converted file with a few extra spaces
152        print STDERR "strange char: $num\n";
153        $out .= " ";
154
155    }
156    }
157    return $out;
158}
159
160# utf82unicode takes a utf-8 string and produces a unicode
161# array
162sub utf82unicode {
163    my ($in) = @_;
164    my $out = [];
165
166    if(!defined $in) {
167    return $out;
168    }
169
170    my $i = 0;
171    my ($c1, $c2, $c3);
172    my $len = length($in);
173    while ($i < $len) {
174    if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175        # normal ascii character
176        push (@$out, $c1);
177
178    } elsif ($c1 < 0xc0) {
179        # error, was expecting the first byte of an
180        # encoded character. Do nothing.
181
182    } elsif ($c1 < 0xe0 && $i+1 < $len) {
183        # an encoded character with two bytes
184        $c2 = ord (substr ($in, $i+1, 1));
185        if ($c2 >= 0x80 && $c2 < 0xc0) {
186        # everything looks ok
187        push (@$out, ((($c1 & 0x1f) << 6) +
188              ($c2 & 0x3f)));
189        $i++; # gobbled an extra byte
190        }
191
192    } elsif ($c1 < 0xf0 && $i+2 < $len) {
193        # an encoded character with three bytes
194        $c2 = ord (substr ($in, $i+1, 1));
195        $c3 = ord (substr ($in, $i+2, 1));
196        if ($c2 >= 0x80 && $c2 < 0xc0 &&
197        $c3 >= 0x80 && $c3 < 0xc0) {
198        # everything looks ok
199        push (@$out, ((($c1 & 0xf) << 12) +
200              (($c2 & 0x3f) << 6) +
201              ($c3 & 0x3f)));
202
203        $i += 2; # gobbled an extra two bytes
204        }
205
206    } else {
207        # error, only decode Unicode characters not full UCS.
208        # Do nothing.
209    }
210
211    $i++;
212    }
213
214    return $out;
215}
216
217# unicode2ucs2 takes a unicode array and produces a UCS-2
218# unicode string (every two bytes forms a unicode character)
219sub unicode2ucs2 {
220    my ($in) = @_;
221    my $out = "";
222
223    foreach my $num (@$in) {
224    $out .= chr (($num & 0xff00) >> 8);
225    $out .= chr ($num & 0xff);
226    }
227
228    return $out;
229}
230
231# ucs22unicode takes a UCS-2 string and produces a unicode array
232sub ucs22unicode {
233    my ($in) = @_;
234    my $out = [];
235
236    my $i = 0;
237    my $len = length ($in);
238    while ($i+1 < $len) {
239    push (@$out, ord (substr($in, $i, 1)) << 8 +
240          ord (substr($in, $i+1, 1)));
241
242    $i ++;
243    }
244
245    return $out;
246}
247
248# takes a reference to a string and returns a reference to a unicode array
249sub convert2unicode {
250    my ($encoding, $textref) = @_;
251
252    if (!defined $encodings::encodings->{$encoding}) {
253    print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254    return [];
255    }
256
257    my $encodename = "$encoding-unicode";
258    my $enc_info = $encodings::encodings->{$encoding};
259    my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260                      "to_uc", $enc_info->{'mapfile'});
261    if (!&loadmapencoding ($encodename, $mapfile)) {
262    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263    return [];
264    }
265   
266    if (defined $enc_info->{'converter'}) {
267    my $converter = $enc_info->{'converter'};
268    return &$converter ($encodename, $textref);
269    }
270
271    if ($unicode::translations{$encodename}->{'count'} == 1) {
272    return &singlebyte2unicode ($encodename, $textref);
273    } else {
274    return &doublebyte2unicode ($encodename, $textref);
275    }
276}
277
278# singlebyte2unicode converts simple 8 bit encodings where characters below
279# 0x80 are normal ascii characters and the rest are decoded using the
280# appropriate mapping files.
281#
282# Examples of encodings that may be converted using singlebyte2unicode are
283# the iso-8859 and windows-125* series.
284sub singlebyte2unicode {
285    my ($encodename, $textref) = @_;
286
287    my @outtext = ();
288    my $len = length($$textref);
289    my ($c);
290    my $i = 0;
291
292    while ($i < $len) {
293    if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294        # normal ascii character
295        push (@outtext, $c);
296    } else {
297        $c = &transchar ($encodename, $c);
298        # put a black square if cannot translate
299        $c = 0x25A1 if $c == 0;
300        push (@outtext, $c);
301    }
302    $i ++;
303    }
304    return \@outtext;
305}
306
307# doublebyte2unicode converts simple two byte encodings where characters
308# below code point 0x80 are single-byte characters and the rest are
309# double-byte characters.
310#
311# Examples of encodings that may be converted using doublebyte2unicode are
312# CJK encodings like GB encoded Chinese and UHC Korean.
313#
314# Note that no error checking is performed to make sure that the input text
315# is valid for the given encoding.
316#
317# Also, encodings that may contain characters of more than two bytes are
318# not supported (any EUC encoded text may in theory contain 3-byte
319# characters but in practice only one and two byte characters are used).
320sub doublebyte2unicode {
321    my ($encodename, $textref) = @_;   
322   
323    my @outtext = ();
324    my $len = length($$textref);
325    my ($c1, $c2);
326    my $i = 0;
327
328    while ($i < $len) {
329    if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330        if ($i+1 < $len) {
331        # double-byte character
332        $c2 = ord(substr($$textref, $i+1, 1));
333        my $c = &transchar ($encodename, ($c1 << 8) | $c2);
334        # put a black square if cannot translate
335        $c = 0x25A1 if $c == 0;
336        push (@outtext, $c);
337        $i += 2;
338       
339        } else {
340        # error
341        print STDERR "unicode: ERROR missing second half of double-byte character\n";
342        $i++;
343        }
344       
345    } else {
346        # single-byte character
347        push (@outtext, $c1);
348        $i++;
349    }
350    }
351    return \@outtext;
352}
353
354# Shift-JIS to unicode
355# We can't use doublebyte2unicode for Shift-JIS because it uses some
356# single-byte characters above code point 0x80 (i.e. half-width katakana
357# characters in the range 0xA1-0xDF)
358sub shiftjis2unicode {
359    my ($encodename, $textref) = @_;
360   
361    my @outtext = ();
362    my $len = length($$textref);
363    my ($c1, $c2);
364    my $i = 0;
365
366    while ($i < $len) {
367    $c1 = ord(substr($$textref, $i, 1));
368
369    if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
370        # Single-byte half-width katakana character or
371        # JIS Roman yen or overline characters
372        my $c = &transchar ($encodename, $c1);
373        # - put a black square if cannot translate
374        $c = 0x25A1 if $c == 0;
375        push (@outtext, $c);
376        $i++;
377
378    } elsif ($c1 < 0x80) {
379        # ASCII
380        push (@outtext, $c1);
381        $i ++;
382
383    } elsif ($c1 < 0xEF) {
384        if ($i+1 < $len) {
385        $c2 = ord(substr($$textref, $i+1, 1));
386        if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
387            # Double-byte shift-jis character
388            my $c = &transchar ($encodename, ($c1 << 8) | $c2);
389            # put a black square if cannot translate
390            $c = 0x25A1 if $c == 0;
391            push (@outtext, $c);
392        } else {
393            # error
394            print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395        }
396        $i += 2;
397        } else {
398        # error
399        print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400        $i ++;
401        }
402    } else {
403        # error
404        print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405        $i ++;
406    }
407    }
408    return \@outtext;
409}
410
411sub transchar {
412    my ($encoding, $from) = @_;
413    my $high = ($from / 256) % 256;
414    my $low = $from % 256;
415
416    return 0 unless defined $unicode::translations{$encoding};
417
418    my $block = $unicode::translations{$encoding}->{'map'};
419
420    if (ref ($block->[$high]) ne "ARRAY") {
421    return 0;
422    }
423    return $block->[$high]->[$low];
424}
425
426# %translations is of the form:
427#
428# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430#
431# Any of the top translation blocks can point to an undefined
432# value. This data structure aims to allow fast translation and
433# efficient storage.
434%unicode::translations = ();
435
436# @array256 is used for initialisation, there must be
437# a better way...
438# What about this?: @array256 = (0) x 256;
439@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456# returns 1 if successful, 0 if unsuccessful
457sub loadmapencoding {
458    my ($encoding, $mapfile) = @_;
459   
460    # check to see if the encoding has already been loaded
461    return 1 if (defined $unicode::translations{$encoding});
462
463    if (! -r $mapfile || -d $mapfile) {
464    return 0;
465    }
466    return 0 unless open (MAPFILE, $mapfile);
467    binmode (MAPFILE);
468
469    $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470    my $block = $unicode::translations{$encoding};
471
472    my ($in,$i,$j);
473    while (1) {
474    my $ret=read(MAPFILE, $in, 1);
475    if (!defined($ret)) { # error
476        print STDERR "unicode.pm: error reading mapfile: $!\n";
477        last;
478    }
479    if ($ret != 1) { last }
480    $i = unpack ("C", $in);
481    $block->{'map'}->[$i] = [@unicode::array256];
482    for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483        my ($n1, $n2) = unpack ("CC", $in);
484        $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485    }
486    $block->{'count'} ++;
487    }
488
489    close (MAPFILE);
490}
491
492# unicode2singlebyte converts unicode to simple 8 bit encodings where
493# characters below 0x80 are normal ascii characters and the rest are encoded
494# using the appropriate mapping files.
495#
496# Examples of encodings that may be converted using unicode2singlebyte are
497# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498sub unicode2singlebyte {
499    my ($uniref, $encoding) = @_;
500
501    my $outtext = "";
502    my $encodename = "unicode-$encoding";
503
504    if (!exists $encodings::encodings->{$encoding}) {
505    print STDERR "unicode.pm: ERROR - unsupported encoding "
506        . "'$encoding' requested\n";
507    return "";
508    }
509
510    my $enc_info = $encodings::encodings->{$encoding};
511    my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512                      "from_uc", $enc_info->{'mapfile'});
513    if (!&loadmapencoding ($encodename, $mapfile)) {
514    print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515    return "";
516    }
517   
518    foreach my $c (@$uniref) {
519    if ($c < 0x80) {
520        # normal ascii character
521        $outtext .= chr($c);
522    } else {
523        # extended ascii character
524        $c = &transchar ($encodename, $c);
525
526        # put a question mark if cannot translate
527        if ($c == 0) {
528        $outtext .= "?";
529        } else {
530        $outtext .= chr($c);
531        }
532    }
533    }
534    return $outtext;
535}
536
537
538# this makes sure that the referenced input string is utf8 encoded, and
539# will change/remove bytes that aren't.
540# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541sub ensure_utf8 {
542    my $stringref=shift;
543
544    if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
545    return $stringref;
546    }
547
548    my $value=$$stringref;
549
550    my $non_utf8_found = 0;
551    $value =~ m/^/g; # to set \G
552    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553    my $highbytes=$1;
554    my $highbyteslength=length($highbytes);
555    # make sure this block of high bytes is utf-8
556    $highbytes =~ /^/g; # set pos()
557    my $byte_replaced = 0;
558    while ($highbytes =~
559        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
560            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
561            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
562            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
563            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
564            )*([\x80-\xff])? !xg
565        ) {
566        # this highbyte is "out-of-place" for valid utf-8
567        my $badbyte=$1;
568        if (!defined $badbyte) {next} # hit end of string
569        my $pos=pos($highbytes);
570        # replace bad byte. assume iso-8859-1 -> utf-8
571        # ascii2utf8 does "extended ascii"... ie iso-8859-1
572        my $replacement=&unicode::ascii2utf8(\$badbyte);
573        substr($highbytes, $pos-1, 1, $replacement);
574        # update the position to continue searching (for \G)
575        pos($highbytes) = $pos+length($replacement)-1;
576        $byte_replaced = 1;
577    }
578    if ($byte_replaced) {
579        # replace this block of high bytes in the $value
580        $non_utf8_found = 1;
581        my $replength=length($highbytes); # we've changed the length
582        my $textpos=pos($value); # pos at end of last match
583        # replace bad bytes with good bytes
584        substr($value, $textpos-$highbyteslength,
585                $highbyteslength, $highbytes);
586        # update the position to continue searching (for \G)
587        pos($value)=$textpos+($replength-$highbyteslength)+1;
588    }
589    }
590
591    $$stringref = $value;
592    return $non_utf8_found;
593}
594
595# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596# Does not modify the string parameter.
597sub check_is_utf8 {
598    my $value=shift;
599
600    if (!defined($value)) {
601    return 0; # not utf8 because it is undefined
602    }
603
604    $value =~ m/^/g; # to set \G
605    while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606    my $highbytes=$1;
607    # make sure this block of high bytes is utf-8
608    $highbytes =~ /^/g; # set pos()
609    while ($highbytes =~
610        m!\G (?: [\xc0-\xdf][\x80-\xbf]   | # 2 byte utf-8
611            [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
612            [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
613            [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
614            [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
615            )*([\x80-\xff])? !xg
616        ) {
617        my $badbyte=$1;
618        if (defined $badbyte) { # not end of string
619        return 0; # non-utf8 found
620        }
621    }
622    }
623   
624    return 1;
625}
626
627sub url_encode {
628    my ($text) = @_;
629   
630    if (!&is_url_encoded($text)) {
631    $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632    # return the url-encoded character entity for underscore back to the entity
633    $text =~ s/%26%23095%3B/&\#095;/g;
634    }
635    return $text;
636}
637
638sub url_decode {
639    my ($text,$and_numeric_entities) = @_;
640
641    if(defined $text) {
642    $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643   
644    if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645        $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646        $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647    }
648    }
649
650    return $text;
651}
652
653sub url_decode_utf8 {
654    my ($text,$and_numeric_entities) = @_;
655
656    $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658    $text = Encode::decode("utf8",$text);
659
660    return $text;
661}
662
663sub is_url_encoded {
664    my ($text) = @_;
665    return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
666}
667
668# When a filename on the filesystem is already URL-encoded, the
669# URL to it will have %25s in place of every % sign, so that
670# URLs in html pages can refer to the URL-encoded filename.
671# This method changes the URL reference back into the actual
672# (URL-encoded) filename on the filesystem by replacing %25 with %.
673sub url_to_filename {
674    my ($text) =@_;
675    $text =~ s/%25/%/g if &is_url_encoded($text);
676    # DM safing would have replaced underscores with character entity &#095;
677    # in SourceFile meta. Undo any such change to get the filename referred to.
678    $text =~ s/&\#095;/_/g;
679    return $text;
680}
681
682# When a filename on the filesystem is already URL-encoded, the
683# URL to it will have %25s in place of every % sign, so that
684# URLs in html pages can refer to the URL-encoded filename.
685# Given a (URL-encoded) filename on the filesystem, this subroutine
686# returns the URL reference string for it by replacing % with %25.
687# The output string will be the same as the input string if the input
688# already contains one or more %25s. This is to prevent processing
689# a url more than once this way.
690sub filename_to_url {
691    my ($text) = @_;
692   
693    if($text !~ m/%25/) {
694    $text =~ s/%/%25/g;
695    }
696    return $text;
697}
698
699# Base64 encoding does not encode a pure ASCII to itself. This is important to know.
700# If the $force_encode parameter is true, then this method WILL base64 encode whatever
701# string is passed in, including any plain ASCII string.
702# That means this method could double encode an already encoded string.
703# However, this method is necessary because on the GLI end, we can't detect whether a plain
704# ASCII string has been encoded or not. And if gsdlsourcefilerenamemethod is set to base64,
705# then gli will always attempt to decode all parts of the relative path gsdlsourcefilename
706# (except the "import" prefix, which is special) or none of the parts.
707sub force_base64_encode {
708    my ($text) = @_;
709    my $force_encode = 1;
710    return &base64_encode($text, $force_encode);
711}
712
713sub base64_encode {
714    my ($text, $force_encode) = @_;
715    if($force_encode || !&conforms_to_mod_base64($text)) {
716    # return entity for underscore to underscore before encoding
717    $text =~ s/&\#095;/_/g;
718
719    $text = &MIME::Base64::encode_base64($text);
720    # base64 encoding may introduce + and / signs,
721    # replacing them with - and _ to ensure it's filename-safe
722    $text =~ s/\+/\-/g; # + -> -
723    $text =~ s/\//\_/g; # / -> _
724    }
725    return $text;
726}
727
728# If the input fits the modified base64 pattern, this will try decoding it.
729# Still, this method does not guarantee the return value is the 'original', only
730# that the result is where the base64 decoding process has been applied once.
731# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
732sub base64_decode {
733    my ($text) = @_;
734    if(&conforms_to_mod_base64($text)) {
735    # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
736    # have replaced them with - and _ respectively. Undo this before decoding.
737    $text =~ s/\-/\+/g;      # - -> +
738    $text =~ s/\_/\//g;      # _ -> /
739    $text = &MIME::Base64::decode_base64($text);
740    }
741    return $text;
742}
743
744# Returns true if the given string is compatible with a modified version
745# of base64 (where the + and / are replaced with - and _), a format which
746# includes also regular ASCII alphanumeric values. This method does not
747# guarantee that the given string is actually base64 encoded, since it will
748# return true for any simple alphanumeric ASCII string as well.
749sub conforms_to_mod_base64 {
750    my ($text) = @_;
751
752    # need to treat the entity ref for underscore as underscore
753    $text =~ s/&\#095;/_/g;
754
755    # base 64 takes alphanumeric and [=+/],
756    # but we use modified base64 where + and / are replaced with  - and _
757    return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
758}
759
760sub substr
761{
762    my ($utf8_string, $offset, $length) = @_;
763
764    my @unicode_string = @{&utf82unicode($utf8_string)};
765    my $unicode_string_length = scalar(@unicode_string);
766
767    my $substr_start = $offset;
768    if ($substr_start >= $unicode_string_length) {
769    return "";
770    }
771
772    my $substr_end = $offset + $length - 1;
773    if ($substr_end >= $unicode_string_length) {
774    $substr_end = $unicode_string_length - 1;
775    }
776
777    my @unicode_substring = @unicode_string[$substr_start..$substr_end];
778    return &unicode2utf8(\@unicode_substring);
779}
780
781# perl version of stringToHex
782# Useful method to print UTF8 (or other unicode) for debugging.
783# Characters that are easily displayed (that is, printable ASCII)
784# are shown as-is, whereas hex values of the unicode code points
785# are shown for all other chars.
786sub debug_unicode_string
787{
788    join("",
789         map { $_ > 127 ?                      # if wide character...
790                   sprintf("\\x{%04X}", $_) :  # \x{...}
791                   chr($_)         
792               } unpack("U*", $_[0]));         # unpack Unicode characters
793}
794
795
796sub raw_filename_to_url_encoded
797{
798    my ($str_in) = @_;
799
800    my @url_encoded_chars
801    = map { $_ > 255 ?                  # Needs to be represent in entity form
802            sprintf("&#x%X;",$_) : 
803            $_>127 || $_==ord("%") ?              # Representable in %XX form
804            sprintf("%%%2X", $_) : 
805            chr($_)                 # otherwise, Ascii char
806        } unpack("U*", $str_in); # Unpack Unicode characters
807
808   
809    my $str_out = join("", @url_encoded_chars);
810
811    return $str_out;
812
813}
814
815sub url_encoded_to_raw_filename
816{
817    my ($str_in) = @_;
818
819    my $str_out = $str_in;
820
821    $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
822    $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
823    $str_out =~ s/&#([0-9]+);/chr($1)/eig;
824
825    return $str_out;
826}
827
828
829sub raw_filename_to_utf8_url_encoded
830{
831    my ($str_in) = @_;
832
833    $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
834
835    my @url_encoded_chars
836    = map { $_ > 127 ?                  # Representable in %XX form
837            sprintf("%%%2X", $_) : 
838            chr($_)                 # otherwise, Ascii char
839        } unpack("U*", $str_in); # Unpack utf8 characters
840
841   
842    my $str_out = join("", @url_encoded_chars);
843
844    return $str_out;
845
846}
847
848sub utf8_url_encoded_to_raw_filename
849{
850    my ($str_in) = @_;
851
852    my $utf8_str_out = $str_in;
853
854    $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
855
856    my $unicode_str_out = decode("utf8",$utf8_str_out);
857    my $raw_str_out = utf8::downgrade($unicode_str_out);
858   
859    return $raw_str_out;
860}
861
862sub analyze_raw_string
863{
864    my ($str_in) = @_;
865
866    my $uses_bytecodes = 0;
867    my $exceeds_bytecodes = 0;
868
869    map { $exceeds_bytecodes = 1 if ($_ >= 256);
870      $uses_bytecodes    = 1 if (($_ >= 128) && ($_ < 256));
871    } unpack("U*", $str_in); # Unpack Unicode characters
872
873    return ($uses_bytecodes,$exceeds_bytecodes);
874}
875
876
877sub convert_utf8_string_to_unicode_string
878{
879    my $utf8_string = shift(@_);
880
881    my $unicode_string = "";
882    foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
883    $unicode_string .= chr($unicode_value);
884    }
885    return $unicode_string;
886}
887
888sub convert_unicode_string_to_utf8_string
889{
890    my $unicode_string = shift(@_);
891
892    my @unicode_array;
893    for (my $i = 0; $i < length($unicode_string); $i++) {
894    push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
895    }
896    return &unicode::unicode2utf8(\@unicode_array);
897}
898
899
9001;
Note: See TracBrowser for help on using the browser.