source: main/trunk/greenstone2/perllib/unicode.pm@ 32594

Last change on this file since 32594 was 28796, checked in by ak19, 10 years ago

Diego's Spano's bugfix to bug found by Justin Cooper when Justin was trying to process a problematic PDF. The mailing list documents their discussion.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 23.7 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use FileUtils;
39use MIME::Base64; # for base64 encoding
40
41no strict 'refs';
42
43
44
45sub utf8decomp
46{
47 my ($str) = @_;
48
49 return if (!defined $str);
50 return "" if ($str eq "");
51
52 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
53
54 my @each_char
55 = map { ($_ > 255 )
56 ? # if wide character...
57 sprintf("\\x{%04X}", $_)
58 : # \x{...}
59 (chr($_) =~ m/[[:cntrl:]]/ )
60 ? # else if control character ...
61 sprintf("\\x%02X", $_)
62 : # \x..
63 quotemeta(chr($_)) # else quoted or as themselves
64 } @unpacked_chars;
65
66 return join("",@each_char);
67}
68
69
70sub hex_codepoint {
71 if (my $char = shift) {
72 return sprintf '%2.2x', unpack('U0U*', $char);
73 }
74}
75
76
77
78
79# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
80# and returns a unicode array.
81sub ascii2unicode {
82 my ($in) = @_;
83 my $out = [];
84
85 my $i = 0;
86 my $len = length($in);
87 while ($i < $len) {
88 push (@$out, ord(substr ($in, $i, 1)));
89 $i++;
90 }
91
92 return $out;
93}
94
95# ascii2utf8 takes a reference to an (extended) ascii string and returns a
96# UTF-8 encoded string. This is just a faster version of
97# "&unicode2utf8(&ascii2unicode($str));"
98# "Extended ascii" really means "iso_8859_1"
99sub ascii2utf8 {
100 my ($in) = @_;
101 my $out = "";
102
103 if (!defined($in)|| !defined($$in)) {
104 return $out;
105 }
106
107 my ($c);
108 my $i = 0;
109 my $len = length($$in);
110 while ($i < $len) {
111 $c = ord (substr ($$in, $i, 1));
112 if ($c < 0x80) {
113 # ascii character
114 $out .= chr ($c);
115
116 } else {
117 # extended ascii character
118 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
119 $out .= chr (0x80 + ($c & 0x3f));
120 }
121 $i++;
122 }
123
124 return $out;
125}
126
127# unicode2utf8 takes a unicode array as input and encodes it
128# using utf-8
129sub unicode2utf8 {
130 my ($in) = @_;
131 my $out = "";
132
133 foreach my $num (@$in) {
134 next unless defined $num;
135 if ($num < 0x80) {
136 $out .= chr ($num);
137
138 } elsif ($num < 0x800) {
139 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
140 $out .= chr (0x80 + ($num & 0x3f));
141
142 } elsif ($num < 0xFFFF) {
143 $out .= chr (0xe0 + (($num >> 12) & 0xf));
144 $out .= chr (0x80 + (($num >> 6) & 0x3f));
145 $out .= chr (0x80 + ($num & 0x3f));
146
147 } else {
148 # error, don't encode anything
149 #die;
150 # Diego's bugfix: instead of aborting the import process, it
151 # is better to get a converted file with a few extra spaces
152 print STDERR "strange char: $num\n";
153 $out .= " ";
154
155 }
156 }
157 return $out;
158}
159
160# utf82unicode takes a utf-8 string and produces a unicode
161# array
162sub utf82unicode {
163 my ($in) = @_;
164 my $out = [];
165
166 if(!defined $in) {
167 return $out;
168 }
169
170 my $i = 0;
171 my ($c1, $c2, $c3);
172 my $len = length($in);
173 while ($i < $len) {
174 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
175 # normal ascii character
176 push (@$out, $c1);
177
178 } elsif ($c1 < 0xc0) {
179 # error, was expecting the first byte of an
180 # encoded character. Do nothing.
181
182 } elsif ($c1 < 0xe0 && $i+1 < $len) {
183 # an encoded character with two bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 if ($c2 >= 0x80 && $c2 < 0xc0) {
186 # everything looks ok
187 push (@$out, ((($c1 & 0x1f) << 6) +
188 ($c2 & 0x3f)));
189 $i++; # gobbled an extra byte
190 }
191
192 } elsif ($c1 < 0xf0 && $i+2 < $len) {
193 # an encoded character with three bytes
194 $c2 = ord (substr ($in, $i+1, 1));
195 $c3 = ord (substr ($in, $i+2, 1));
196 if ($c2 >= 0x80 && $c2 < 0xc0 &&
197 $c3 >= 0x80 && $c3 < 0xc0) {
198 # everything looks ok
199 push (@$out, ((($c1 & 0xf) << 12) +
200 (($c2 & 0x3f) << 6) +
201 ($c3 & 0x3f)));
202
203 $i += 2; # gobbled an extra two bytes
204 }
205
206 } else {
207 # error, only decode Unicode characters not full UCS.
208 # Do nothing.
209 }
210
211 $i++;
212 }
213
214 return $out;
215}
216
217# unicode2ucs2 takes a unicode array and produces a UCS-2
218# unicode string (every two bytes forms a unicode character)
219sub unicode2ucs2 {
220 my ($in) = @_;
221 my $out = "";
222
223 foreach my $num (@$in) {
224 $out .= chr (($num & 0xff00) >> 8);
225 $out .= chr ($num & 0xff);
226 }
227
228 return $out;
229}
230
231# ucs22unicode takes a UCS-2 string and produces a unicode array
232sub ucs22unicode {
233 my ($in) = @_;
234 my $out = [];
235
236 my $i = 0;
237 my $len = length ($in);
238 while ($i+1 < $len) {
239 push (@$out, ord (substr($in, $i, 1)) << 8 +
240 ord (substr($in, $i+1, 1)));
241
242 $i ++;
243 }
244
245 return $out;
246}
247
248# takes a reference to a string and returns a reference to a unicode array
249sub convert2unicode {
250 my ($encoding, $textref) = @_;
251
252 if (!defined $encodings::encodings->{$encoding}) {
253 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
254 return [];
255 }
256
257 my $encodename = "$encoding-unicode";
258 my $enc_info = $encodings::encodings->{$encoding};
259 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
260 "to_uc", $enc_info->{'mapfile'});
261 if (!&loadmapencoding ($encodename, $mapfile)) {
262 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
263 return [];
264 }
265
266 if (defined $enc_info->{'converter'}) {
267 my $converter = $enc_info->{'converter'};
268 return &$converter ($encodename, $textref);
269 }
270
271 if ($unicode::translations{$encodename}->{'count'} == 1) {
272 return &singlebyte2unicode ($encodename, $textref);
273 } else {
274 return &doublebyte2unicode ($encodename, $textref);
275 }
276}
277
278# singlebyte2unicode converts simple 8 bit encodings where characters below
279# 0x80 are normal ascii characters and the rest are decoded using the
280# appropriate mapping files.
281#
282# Examples of encodings that may be converted using singlebyte2unicode are
283# the iso-8859 and windows-125* series.
284sub singlebyte2unicode {
285 my ($encodename, $textref) = @_;
286
287 my @outtext = ();
288 my $len = length($$textref);
289 my ($c);
290 my $i = 0;
291
292 while ($i < $len) {
293 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
294 # normal ascii character
295 push (@outtext, $c);
296 } else {
297 $c = &transchar ($encodename, $c);
298 # put a black square if cannot translate
299 $c = 0x25A1 if $c == 0;
300 push (@outtext, $c);
301 }
302 $i ++;
303 }
304 return \@outtext;
305}
306
307# doublebyte2unicode converts simple two byte encodings where characters
308# below code point 0x80 are single-byte characters and the rest are
309# double-byte characters.
310#
311# Examples of encodings that may be converted using doublebyte2unicode are
312# CJK encodings like GB encoded Chinese and UHC Korean.
313#
314# Note that no error checking is performed to make sure that the input text
315# is valid for the given encoding.
316#
317# Also, encodings that may contain characters of more than two bytes are
318# not supported (any EUC encoded text may in theory contain 3-byte
319# characters but in practice only one and two byte characters are used).
320sub doublebyte2unicode {
321 my ($encodename, $textref) = @_;
322
323 my @outtext = ();
324 my $len = length($$textref);
325 my ($c1, $c2);
326 my $i = 0;
327
328 while ($i < $len) {
329 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
330 if ($i+1 < $len) {
331 # double-byte character
332 $c2 = ord(substr($$textref, $i+1, 1));
333 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
334 # put a black square if cannot translate
335 $c = 0x25A1 if $c == 0;
336 push (@outtext, $c);
337 $i += 2;
338
339 } else {
340 # error
341 print STDERR "unicode: ERROR missing second half of double-byte character\n";
342 $i++;
343 }
344
345 } else {
346 # single-byte character
347 push (@outtext, $c1);
348 $i++;
349 }
350 }
351 return \@outtext;
352}
353
354# Shift-JIS to unicode
355# We can't use doublebyte2unicode for Shift-JIS because it uses some
356# single-byte characters above code point 0x80 (i.e. half-width katakana
357# characters in the range 0xA1-0xDF)
358sub shiftjis2unicode {
359 my ($encodename, $textref) = @_;
360
361 my @outtext = ();
362 my $len = length($$textref);
363 my ($c1, $c2);
364 my $i = 0;
365
366 while ($i < $len) {
367 $c1 = ord(substr($$textref, $i, 1));
368
369 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
370 # Single-byte half-width katakana character or
371 # JIS Roman yen or overline characters
372 my $c = &transchar ($encodename, $c1);
373 # - put a black square if cannot translate
374 $c = 0x25A1 if $c == 0;
375 push (@outtext, $c);
376 $i++;
377
378 } elsif ($c1 < 0x80) {
379 # ASCII
380 push (@outtext, $c1);
381 $i ++;
382
383 } elsif ($c1 < 0xEF) {
384 if ($i+1 < $len) {
385 $c2 = ord(substr($$textref, $i+1, 1));
386 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
387 # Double-byte shift-jis character
388 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
389 # put a black square if cannot translate
390 $c = 0x25A1 if $c == 0;
391 push (@outtext, $c);
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 }
396 $i += 2;
397 } else {
398 # error
399 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
400 $i ++;
401 }
402 } else {
403 # error
404 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
405 $i ++;
406 }
407 }
408 return \@outtext;
409}
410
411sub transchar {
412 my ($encoding, $from) = @_;
413 my $high = ($from / 256) % 256;
414 my $low = $from % 256;
415
416 return 0 unless defined $unicode::translations{$encoding};
417
418 my $block = $unicode::translations{$encoding}->{'map'};
419
420 if (ref ($block->[$high]) ne "ARRAY") {
421 return 0;
422 }
423 return $block->[$high]->[$low];
424}
425
426# %translations is of the form:
427#
428# encodings{encodingname-encodingname}->{'map'}->blocktranslation
429# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
430#
431# Any of the top translation blocks can point to an undefined
432# value. This data structure aims to allow fast translation and
433# efficient storage.
434%unicode::translations = ();
435
436# @array256 is used for initialisation, there must be
437# a better way...
438# What about this?: @array256 = (0) x 256;
439@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
445 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
446 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
447 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
448 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
449 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
450 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
451 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
452 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
453 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
454 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
455
456# returns 1 if successful, 0 if unsuccessful
457sub loadmapencoding {
458 my ($encoding, $mapfile) = @_;
459
460 # check to see if the encoding has already been loaded
461 return 1 if (defined $unicode::translations{$encoding});
462
463 if (! -r $mapfile || -d $mapfile) {
464 return 0;
465 }
466 return 0 unless open (MAPFILE, $mapfile);
467 binmode (MAPFILE);
468
469 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
470 my $block = $unicode::translations{$encoding};
471
472 my ($in,$i,$j);
473 while (1) {
474 my $ret=read(MAPFILE, $in, 1);
475 if (!defined($ret)) { # error
476 print STDERR "unicode.pm: error reading mapfile: $!\n";
477 last;
478 }
479 if ($ret != 1) { last }
480 $i = unpack ("C", $in);
481 $block->{'map'}->[$i] = [@unicode::array256];
482 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
483 my ($n1, $n2) = unpack ("CC", $in);
484 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
485 }
486 $block->{'count'} ++;
487 }
488
489 close (MAPFILE);
490}
491
492# unicode2singlebyte converts unicode to simple 8 bit encodings where
493# characters below 0x80 are normal ascii characters and the rest are encoded
494# using the appropriate mapping files.
495#
496# Examples of encodings that may be converted using unicode2singlebyte are
497# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
498sub unicode2singlebyte {
499 my ($uniref, $encoding) = @_;
500
501 my $outtext = "";
502 my $encodename = "unicode-$encoding";
503
504 if (!exists $encodings::encodings->{$encoding}) {
505 print STDERR "unicode.pm: ERROR - unsupported encoding "
506 . "'$encoding' requested\n";
507 return "";
508 }
509
510 my $enc_info = $encodings::encodings->{$encoding};
511 my $mapfile = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'}, "mappings",
512 "from_uc", $enc_info->{'mapfile'});
513 if (!&loadmapencoding ($encodename, $mapfile)) {
514 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
515 return "";
516 }
517
518 foreach my $c (@$uniref) {
519 if ($c < 0x80) {
520 # normal ascii character
521 $outtext .= chr($c);
522 } else {
523 # extended ascii character
524 $c = &transchar ($encodename, $c);
525
526 # put a question mark if cannot translate
527 if ($c == 0) {
528 $outtext .= "?";
529 } else {
530 $outtext .= chr($c);
531 }
532 }
533 }
534 return $outtext;
535}
536
537
538# this makes sure that the referenced input string is utf8 encoded, and
539# will change/remove bytes that aren't.
540# returns 0 if the text was already utf8, or 1 if text modified to become utf8
541sub ensure_utf8 {
542 my $stringref=shift;
543
544 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
545 return $stringref;
546 }
547
548 my $value=$$stringref;
549
550 my $non_utf8_found = 0;
551 $value =~ m/^/g; # to set \G
552 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
553 my $highbytes=$1;
554 my $highbyteslength=length($highbytes);
555 # make sure this block of high bytes is utf-8
556 $highbytes =~ /^/g; # set pos()
557 my $byte_replaced = 0;
558 while ($highbytes =~
559 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
560 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
561 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
562 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
563 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
564 )*([\x80-\xff])? !xg
565 ) {
566 # this highbyte is "out-of-place" for valid utf-8
567 my $badbyte=$1;
568 if (!defined $badbyte) {next} # hit end of string
569 my $pos=pos($highbytes);
570 # replace bad byte. assume iso-8859-1 -> utf-8
571 # ascii2utf8 does "extended ascii"... ie iso-8859-1
572 my $replacement=&unicode::ascii2utf8(\$badbyte);
573 substr($highbytes, $pos-1, 1, $replacement);
574 # update the position to continue searching (for \G)
575 pos($highbytes) = $pos+length($replacement)-1;
576 $byte_replaced = 1;
577 }
578 if ($byte_replaced) {
579 # replace this block of high bytes in the $value
580 $non_utf8_found = 1;
581 my $replength=length($highbytes); # we've changed the length
582 my $textpos=pos($value); # pos at end of last match
583 # replace bad bytes with good bytes
584 substr($value, $textpos-$highbyteslength,
585 $highbyteslength, $highbytes);
586 # update the position to continue searching (for \G)
587 pos($value)=$textpos+($replength-$highbyteslength)+1;
588 }
589 }
590
591 $$stringref = $value;
592 return $non_utf8_found;
593}
594
595# Returns true (1) if the given string is utf8 and false (0) if it isn't.
596# Does not modify the string parameter.
597sub check_is_utf8 {
598 my $value=shift;
599
600 if (!defined($value)) {
601 return 0; # not utf8 because it is undefined
602 }
603
604 $value =~ m/^/g; # to set \G
605 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
606 my $highbytes=$1;
607 # make sure this block of high bytes is utf-8
608 $highbytes =~ /^/g; # set pos()
609 while ($highbytes =~
610 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
611 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
612 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
613 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
614 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
615 )*([\x80-\xff])? !xg
616 ) {
617 my $badbyte=$1;
618 if (defined $badbyte) { # not end of string
619 return 0; # non-utf8 found
620 }
621 }
622 }
623
624 return 1;
625}
626
627sub url_encode {
628 my ($text) = @_;
629
630 if (!&is_url_encoded($text)) {
631 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
632 # return the url-encoded character entity for underscore back to the entity
633 $text =~ s/%26%23095%3B/&\#095;/g;
634 }
635 return $text;
636}
637
638sub url_decode {
639 my ($text,$and_numeric_entities) = @_;
640
641 if(defined $text) {
642 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
643
644 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
645 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
646 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
647 }
648 }
649
650 return $text;
651}
652
653sub url_decode_utf8 {
654 my ($text,$and_numeric_entities) = @_;
655
656 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
657
658 $text = Encode::decode("utf8",$text);
659
660 return $text;
661}
662
663sub is_url_encoded {
664 my ($text) = @_;
665 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
666}
667
668# When a filename on the filesystem is already URL-encoded, the
669# URL to it will have %25s in place of every % sign, so that
670# URLs in html pages can refer to the URL-encoded filename.
671# This method changes the URL reference back into the actual
672# (URL-encoded) filename on the filesystem by replacing %25 with %.
673sub url_to_filename {
674 my ($text) =@_;
675 $text =~ s/%25/%/g if &is_url_encoded($text);
676 # DM safing would have replaced underscores with character entity &#095;
677 # in SourceFile meta. Undo any such change to get the filename referred to.
678 $text =~ s/&\#095;/_/g;
679 return $text;
680}
681
682# When a filename on the filesystem is already URL-encoded, the
683# URL to it will have %25s in place of every % sign, so that
684# URLs in html pages can refer to the URL-encoded filename.
685# Given a (URL-encoded) filename on the filesystem, this subroutine
686# returns the URL reference string for it by replacing % with %25.
687# The output string will be the same as the input string if the input
688# already contains one or more %25s. This is to prevent processing
689# a url more than once this way.
690sub filename_to_url {
691 my ($text) = @_;
692
693 if($text !~ m/%25/) {
694 $text =~ s/%/%25/g;
695 }
696 return $text;
697}
698
699sub base64_encode {
700 my ($text) = @_;
701 if(!&conforms_to_mod_base64($text)) {
702 # return entity for underscore to underscore before encoding
703 $text =~ s/&\#095;/_/g;
704
705 $text = &MIME::Base64::encode_base64($text);
706 # base64 encoding may introduce + and / signs,
707 # replacing them with - and _ to ensure it's filename-safe
708 $text =~ s/\+/\-/g; # + -> -
709 $text =~ s/\//\_/g; # / -> _
710 }
711 return $text;
712}
713
714# If the input fits the modified base64 pattern, this will try decoding it.
715# Still, this method does not guarantee the return value is the 'original', only
716# that the result is where the base64 decoding process has been applied once.
717# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
718sub base64_decode {
719 my ($text) = @_;
720 if(&conforms_to_mod_base64($text)) {
721 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
722 # have replaced them with - and _ respectively. Undo this before decoding.
723 $text =~ s/\-/\+/g; # - -> +
724 $text =~ s/\_/\//g; # _ -> /
725 $text = &MIME::Base64::decode_base64($text);
726 }
727 return $text;
728}
729
730# Returns true if the given string is compatible with a modified version
731# of base64 (where the + and / are replaced with - and _), a format which
732# includes also regular ASCII alphanumeric values. This method does not
733# guarantee that the given string is actually base64 encoded, since it will
734# return true for any simple alphanumeric ASCII string as well.
735sub conforms_to_mod_base64 {
736 my ($text) = @_;
737
738 # need to treat the entity ref for underscore as underscore
739 $text =~ s/&\#095;/_/g;
740
741 # base 64 takes alphanumeric and [=+/],
742 # but we use modified base64 where + and / are replaced with - and _
743 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
744}
745
746sub substr
747{
748 my ($utf8_string, $offset, $length) = @_;
749
750 my @unicode_string = @{&utf82unicode($utf8_string)};
751 my $unicode_string_length = scalar(@unicode_string);
752
753 my $substr_start = $offset;
754 if ($substr_start >= $unicode_string_length) {
755 return "";
756 }
757
758 my $substr_end = $offset + $length - 1;
759 if ($substr_end >= $unicode_string_length) {
760 $substr_end = $unicode_string_length - 1;
761 }
762
763 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
764 return &unicode2utf8(\@unicode_substring);
765}
766
767# Useful method to print UTF8 (or other unicode) for debugging.
768# Characters that are easily displayed (that is, printable ASCII)
769# are shown as-is, whereas hex values of the unicode code points
770# are shown for all other chars.
771sub debug_unicode_string
772{
773 join("",
774 map { $_ > 127 ? # if wide character...
775 sprintf("\\x{%04X}", $_) : # \x{...}
776 chr($_)
777 } unpack("U*", $_[0])); # unpack Unicode characters
778}
779
780
781sub raw_filename_to_url_encoded
782{
783 my ($str_in) = @_;
784
785 my @url_encoded_chars
786 = map { $_ > 255 ? # Needs to be represent in entity form
787 sprintf("&#x%X;",$_) :
788 $_>127 || $_==ord("%") ? # Representable in %XX form
789 sprintf("%%%2X", $_) :
790 chr($_) # otherwise, Ascii char
791 } unpack("U*", $str_in); # Unpack Unicode characters
792
793
794 my $str_out = join("", @url_encoded_chars);
795
796 return $str_out;
797
798}
799
800sub url_encoded_to_raw_filename
801{
802 my ($str_in) = @_;
803
804 my $str_out = $str_in;
805
806 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
807 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
808 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
809
810 return $str_out;
811}
812
813
814sub raw_filename_to_utf8_url_encoded
815{
816 my ($str_in) = @_;
817
818 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
819
820 my @url_encoded_chars
821 = map { $_ > 127 ? # Representable in %XX form
822 sprintf("%%%2X", $_) :
823 chr($_) # otherwise, Ascii char
824 } unpack("U*", $str_in); # Unpack utf8 characters
825
826
827 my $str_out = join("", @url_encoded_chars);
828
829 return $str_out;
830
831}
832
833sub utf8_url_encoded_to_raw_filename
834{
835 my ($str_in) = @_;
836
837 my $utf8_str_out = $str_in;
838
839 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
840
841 my $unicode_str_out = decode("utf8",$utf8_str_out);
842 my $raw_str_out = utf8::downgrade($unicode_str_out);
843
844 return $raw_str_out;
845}
846
847sub analyze_raw_string
848{
849 my ($str_in) = @_;
850
851 my $uses_bytecodes = 0;
852 my $exceeds_bytecodes = 0;
853
854 map { $exceeds_bytecodes = 1 if ($_ >= 256);
855 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
856 } unpack("U*", $str_in); # Unpack Unicode characters
857
858 return ($uses_bytecodes,$exceeds_bytecodes);
859}
860
861
862sub convert_utf8_string_to_unicode_string
863{
864 my $utf8_string = shift(@_);
865
866 my $unicode_string = "";
867 foreach my $unicode_value (@{&unicode::utf82unicode($utf8_string)}) {
868 $unicode_string .= chr($unicode_value);
869 }
870 return $unicode_string;
871}
872
873sub convert_unicode_string_to_utf8_string
874{
875 my $unicode_string = shift(@_);
876
877 my @unicode_array;
878 for (my $i = 0; $i < length($unicode_string); $i++) {
879 push(@unicode_array, ord(&substr($unicode_string, $i, 1)));
880 }
881 return &unicode::unicode2utf8(\@unicode_array);
882}
883
884
8851;
Note: See TracBrowser for help on using the repository browser.