source: main/trunk/greenstone2/perllib/unicode.pm@ 23831

Last change on this file since 23831 was 23831, checked in by ak19, 10 years ago

Fixed a minor off by one error: wide characters printed are to be printed as unicode if greater than 127 (instead of testing for greater than 128).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 22.9 KB
Line 
1###########################################################################
2#
3# unicode.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999-2004 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# useful functions for dealing with Unicode
27
28# Unicode strings are stored as arrays of scalars as perl
29# lacks characters are 8-bit (currently)
30
31package unicode;
32
33eval {require bytes};
34
35use encodings;
36use strict;
37use util;
38use MIME::Base64; # for base64 encoding
39
40no strict 'refs';
41
42
43
44sub utf8decomp
45{
46 my ($str) = @_;
47
48 return if (!defined $str);
49 return "" if ($str eq "");
50
51 my @unpacked_chars = unpack("C*", $str); # unpack Unicode characters
52
53 my @each_char
54 = map { ($_ > 255 )
55 ? # if wide character...
56 sprintf("\\x{%04X}", $_)
57 : # \x{...}
58 (chr($_) =~ m/[[:cntrl:]]/ )
59 ? # else if control character ...
60 sprintf("\\x%02X", $_)
61 : # \x..
62 quotemeta(chr($_)) # else quoted or as themselves
63 } @unpacked_chars;
64
65 return join("",@each_char);
66}
67
68
69sub hex_codepoint {
70 if (my $char = shift) {
71 return sprintf '%2.2x', unpack('U0U*', $char);
72 }
73}
74
75
76
77
78# ascii2unicode takes an (extended) ascii string (ISO-8859-1)
79# and returns a unicode array.
80sub ascii2unicode {
81 my ($in) = @_;
82 my $out = [];
83
84 my $i = 0;
85 my $len = length($in);
86 while ($i < $len) {
87 push (@$out, ord(substr ($in, $i, 1)));
88 $i++;
89 }
90
91 return $out;
92}
93
94# ascii2utf8 takes a reference to an (extended) ascii string and returns a
95# UTF-8 encoded string. This is just a faster version of
96# "&unicode2utf8(&ascii2unicode($str));"
97# "Extended ascii" really means "iso_8859_1"
98sub ascii2utf8 {
99 my ($in) = @_;
100 my $out = "";
101
102 if (!defined($in)|| !defined($$in)) {
103 return $out;
104 }
105
106 my ($c);
107 my $i = 0;
108 my $len = length($$in);
109 while ($i < $len) {
110 $c = ord (substr ($$in, $i, 1));
111 if ($c < 0x80) {
112 # ascii character
113 $out .= chr ($c);
114
115 } else {
116 # extended ascii character
117 $out .= chr (0xc0 + (($c >> 6) & 0x1f));
118 $out .= chr (0x80 + ($c & 0x3f));
119 }
120 $i++;
121 }
122
123 return $out;
124}
125
126# unicode2utf8 takes a unicode array as input and encodes it
127# using utf-8
128sub unicode2utf8 {
129 my ($in) = @_;
130 my $out = "";
131
132 foreach my $num (@$in) {
133 next unless defined $num;
134 if ($num < 0x80) {
135 $out .= chr ($num);
136
137 } elsif ($num < 0x800) {
138 $out .= chr (0xc0 + (($num >> 6) & 0x1f));
139 $out .= chr (0x80 + ($num & 0x3f));
140
141 } elsif ($num < 0xFFFF) {
142 $out .= chr (0xe0 + (($num >> 12) & 0xf));
143 $out .= chr (0x80 + (($num >> 6) & 0x3f));
144 $out .= chr (0x80 + ($num & 0x3f));
145
146 } else {
147 # error, don't encode anything
148 die;
149 }
150 }
151 return $out;
152}
153
154# utf82unicode takes a utf-8 string and produces a unicode
155# array
156sub utf82unicode {
157 my ($in) = @_;
158 my $out = [];
159
160 my $i = 0;
161 my ($c1, $c2, $c3);
162 my $len = length($in);
163 while ($i < $len) {
164 if (($c1 = ord(substr ($in, $i, 1))) < 0x80) {
165 # normal ascii character
166 push (@$out, $c1);
167
168 } elsif ($c1 < 0xc0) {
169 # error, was expecting the first byte of an
170 # encoded character. Do nothing.
171
172 } elsif ($c1 < 0xe0 && $i+1 < $len) {
173 # an encoded character with two bytes
174 $c2 = ord (substr ($in, $i+1, 1));
175 if ($c2 >= 0x80 && $c2 < 0xc0) {
176 # everything looks ok
177 push (@$out, ((($c1 & 0x1f) << 6) +
178 ($c2 & 0x3f)));
179 $i++; # gobbled an extra byte
180 }
181
182 } elsif ($c1 < 0xf0 && $i+2 < $len) {
183 # an encoded character with three bytes
184 $c2 = ord (substr ($in, $i+1, 1));
185 $c3 = ord (substr ($in, $i+2, 1));
186 if ($c2 >= 0x80 && $c2 < 0xc0 &&
187 $c3 >= 0x80 && $c3 < 0xc0) {
188 # everything looks ok
189 push (@$out, ((($c1 & 0xf) << 12) +
190 (($c2 & 0x3f) << 6) +
191 ($c3 & 0x3f)));
192
193 $i += 2; # gobbled an extra two bytes
194 }
195
196 } else {
197 # error, only decode Unicode characters not full UCS.
198 # Do nothing.
199 }
200
201 $i++;
202 }
203
204 return $out;
205}
206
207# unicode2ucs2 takes a unicode array and produces a UCS-2
208# unicode string (every two bytes forms a unicode character)
209sub unicode2ucs2 {
210 my ($in) = @_;
211 my $out = "";
212
213 foreach my $num (@$in) {
214 $out .= chr (($num & 0xff00) >> 8);
215 $out .= chr ($num & 0xff);
216 }
217
218 return $out;
219}
220
221# ucs22unicode takes a UCS-2 string and produces a unicode array
222sub ucs22unicode {
223 my ($in) = @_;
224 my $out = [];
225
226 my $i = 0;
227 my $len = length ($in);
228 while ($i+1 < $len) {
229 push (@$out, ord (substr($in, $i, 1)) << 8 +
230 ord (substr($in, $i+1, 1)));
231
232 $i ++;
233 }
234
235 return $out;
236}
237
238# takes a reference to a string and returns a reference to a unicode array
239sub convert2unicode {
240 my ($encoding, $textref) = @_;
241
242 if (!defined $encodings::encodings->{$encoding}) {
243 print STDERR "unicode::convert2unicode: ERROR: Unsupported encoding ($encoding)\n";
244 return [];
245 }
246
247 my $encodename = "$encoding-unicode";
248 my $enc_info = $encodings::encodings->{$encoding};
249 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
250 "to_uc", $enc_info->{'mapfile'});
251 if (!&loadmapencoding ($encodename, $mapfile)) {
252 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
253 return [];
254 }
255
256 if (defined $enc_info->{'converter'}) {
257 my $converter = $enc_info->{'converter'};
258 return &$converter ($encodename, $textref);
259 }
260
261 if ($unicode::translations{$encodename}->{'count'} == 1) {
262 return &singlebyte2unicode ($encodename, $textref);
263 } else {
264 return &doublebyte2unicode ($encodename, $textref);
265 }
266}
267
268# singlebyte2unicode converts simple 8 bit encodings where characters below
269# 0x80 are normal ascii characters and the rest are decoded using the
270# appropriate mapping files.
271#
272# Examples of encodings that may be converted using singlebyte2unicode are
273# the iso-8859 and windows-125* series.
274sub singlebyte2unicode {
275 my ($encodename, $textref) = @_;
276
277 my @outtext = ();
278 my $len = length($$textref);
279 my ($c);
280 my $i = 0;
281
282 while ($i < $len) {
283 if (($c = ord(substr($$textref, $i, 1))) < 0x80) {
284 # normal ascii character
285 push (@outtext, $c);
286 } else {
287 $c = &transchar ($encodename, $c);
288 # put a black square if cannot translate
289 $c = 0x25A1 if $c == 0;
290 push (@outtext, $c);
291 }
292 $i ++;
293 }
294 return \@outtext;
295}
296
297# doublebyte2unicode converts simple two byte encodings where characters
298# below code point 0x80 are single-byte characters and the rest are
299# double-byte characters.
300#
301# Examples of encodings that may be converted using doublebyte2unicode are
302# CJK encodings like GB encoded Chinese and UHC Korean.
303#
304# Note that no error checking is performed to make sure that the input text
305# is valid for the given encoding.
306#
307# Also, encodings that may contain characters of more than two bytes are
308# not supported (any EUC encoded text may in theory contain 3-byte
309# characters but in practice only one and two byte characters are used).
310sub doublebyte2unicode {
311 my ($encodename, $textref) = @_;
312
313 my @outtext = ();
314 my $len = length($$textref);
315 my ($c1, $c2);
316 my $i = 0;
317
318 while ($i < $len) {
319 if (($c1 = ord(substr($$textref, $i, 1))) >= 0x80) {
320 if ($i+1 < $len) {
321 # double-byte character
322 $c2 = ord(substr($$textref, $i+1, 1));
323 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
324 # put a black square if cannot translate
325 $c = 0x25A1 if $c == 0;
326 push (@outtext, $c);
327 $i += 2;
328
329 } else {
330 # error
331 print STDERR "unicode: ERROR missing second half of double-byte character\n";
332 $i++;
333 }
334
335 } else {
336 # single-byte character
337 push (@outtext, $c1);
338 $i++;
339 }
340 }
341 return \@outtext;
342}
343
344# Shift-JIS to unicode
345# We can't use doublebyte2unicode for Shift-JIS because it uses some
346# single-byte characters above code point 0x80 (i.e. half-width katakana
347# characters in the range 0xA1-0xDF)
348sub shiftjis2unicode {
349 my ($encodename, $textref) = @_;
350
351 my @outtext = ();
352 my $len = length($$textref);
353 my ($c1, $c2);
354 my $i = 0;
355
356 while ($i < $len) {
357 $c1 = ord(substr($$textref, $i, 1));
358
359 if (($c1 >= 0xA1 && $c1 <= 0xDF) || $c1 == 0x5c || $c1 == 0x7E) {
360 # Single-byte half-width katakana character or
361 # JIS Roman yen or overline characters
362 my $c = &transchar ($encodename, $c1);
363 # - put a black square if cannot translate
364 $c = 0x25A1 if $c == 0;
365 push (@outtext, $c);
366 $i++;
367
368 } elsif ($c1 < 0x80) {
369 # ASCII
370 push (@outtext, $c1);
371 $i ++;
372
373 } elsif ($c1 < 0xEF) {
374 if ($i+1 < $len) {
375 $c2 = ord(substr($$textref, $i+1, 1));
376 if (($c2 >= 0x40 && $c2 <= 0x7E) || ($c2 >= 0x80 && $c2 <= 0xFC)) {
377 # Double-byte shift-jis character
378 my $c = &transchar ($encodename, ($c1 << 8) | $c2);
379 # put a black square if cannot translate
380 $c = 0x25A1 if $c == 0;
381 push (@outtext, $c);
382 } else {
383 # error
384 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
385 }
386 $i += 2;
387 } else {
388 # error
389 print STDERR "unicode: ERROR missing second half of Shift-JIS character\n";
390 $i ++;
391 }
392 } else {
393 # error
394 print STDERR "unicode: ERROR Invalid Shift-JIS character\n";
395 $i ++;
396 }
397 }
398 return \@outtext;
399}
400
401sub transchar {
402 my ($encoding, $from) = @_;
403 my $high = ($from / 256) % 256;
404 my $low = $from % 256;
405
406 return 0 unless defined $unicode::translations{$encoding};
407
408 my $block = $unicode::translations{$encoding}->{'map'};
409
410 if (ref ($block->[$high]) ne "ARRAY") {
411 return 0;
412 }
413 return $block->[$high]->[$low];
414}
415
416# %translations is of the form:
417#
418# encodings{encodingname-encodingname}->{'map'}->blocktranslation
419# blocktranslation->[[0-255],[256-511], ..., [65280-65535]]
420#
421# Any of the top translation blocks can point to an undefined
422# value. This data structure aims to allow fast translation and
423# efficient storage.
424%unicode::translations = ();
425
426# @array256 is used for initialisation, there must be
427# a better way...
428# What about this?: @array256 = (0) x 256;
429@unicode::array256 = (0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
430 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
431 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
432 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
433 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
434 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
435 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
436 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
437 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
438 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
439 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
440 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
441 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
442 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
443 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
444 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0);
445
446# returns 1 if successful, 0 if unsuccessful
447sub loadmapencoding {
448 my ($encoding, $mapfile) = @_;
449
450 # check to see if the encoding has already been loaded
451 return 1 if (defined $unicode::translations{$encoding});
452
453 if (! -r $mapfile || -d $mapfile) {
454 return 0;
455 }
456 return 0 unless open (MAPFILE, $mapfile);
457 binmode (MAPFILE);
458
459 $unicode::translations{$encoding} = {'map' => [@unicode::array256], 'count' => 0};
460 my $block = $unicode::translations{$encoding};
461
462 my ($in,$i,$j);
463 while (1) {
464 my $ret=read(MAPFILE, $in, 1);
465 if (!defined($ret)) { # error
466 print STDERR "unicode.pm: error reading mapfile: $!\n";
467 last;
468 }
469 if ($ret != 1) { last }
470 $i = unpack ("C", $in);
471 $block->{'map'}->[$i] = [@unicode::array256];
472 for ($j=0; $j<256 && read(MAPFILE, $in, 2)==2; $j++) {
473 my ($n1, $n2) = unpack ("CC", $in);
474 $block->{'map'}->[$i]->[$j] = ($n1*256) + $n2;
475 }
476 $block->{'count'} ++;
477 }
478
479 close (MAPFILE);
480}
481
482# unicode2singlebyte converts unicode to simple 8 bit encodings where
483# characters below 0x80 are normal ascii characters and the rest are encoded
484# using the appropriate mapping files.
485#
486# Examples of encodings that may be converted using unicode2singlebyte are
487# the iso-8859 and windows-125* series, KOI8-R (Russian), and the Kazakh encoding.
488sub unicode2singlebyte {
489 my ($uniref, $encoding) = @_;
490
491 my $outtext = "";
492 my $encodename = "unicode-$encoding";
493
494 if (!exists $encodings::encodings->{$encoding}) {
495 print STDERR "unicode.pm: ERROR - unsupported encoding "
496 . "'$encoding' requested\n";
497 return "";
498 }
499
500 my $enc_info = $encodings::encodings->{$encoding};
501 my $mapfile = &util::filename_cat($ENV{'GSDLHOME'}, "mappings",
502 "from_uc", $enc_info->{'mapfile'});
503 if (!&loadmapencoding ($encodename, $mapfile)) {
504 print STDERR "unicode: ERROR - could not load encoding $encodename: $! $mapfile\n";
505 return "";
506 }
507
508 foreach my $c (@$uniref) {
509 if ($c < 0x80) {
510 # normal ascii character
511 $outtext .= chr($c);
512 } else {
513 # extended ascii character
514 $c = &transchar ($encodename, $c);
515
516 # put a question mark if cannot translate
517 if ($c == 0) {
518 $outtext .= "?";
519 } else {
520 $outtext .= chr($c);
521 }
522 }
523 }
524 return $outtext;
525}
526
527
528# this makes sure that the referenced input string is utf8 encoded, and
529# will change/remove bytes that aren't.
530# returns 0 if the text was already utf8, or 1 if text modified to become utf8
531sub ensure_utf8 {
532 my $stringref=shift;
533
534 if (!defined($stringref) || ref($stringref) ne 'SCALAR') {
535 return $stringref;
536 }
537
538 my $value=$$stringref;
539
540 my $non_utf8_found = 0;
541 $value =~ m/^/g; # to set \G
542 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
543 my $highbytes=$1;
544 my $highbyteslength=length($highbytes);
545 # make sure this block of high bytes is utf-8
546 $highbytes =~ /^/g; # set pos()
547 my $byte_replaced = 0;
548 while ($highbytes =~
549 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
550 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
551 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
552 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
553 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
554 )*([\x80-\xff])? !xg
555 ) {
556 # this highbyte is "out-of-place" for valid utf-8
557 my $badbyte=$1;
558 if (!defined $badbyte) {next} # hit end of string
559 my $pos=pos($highbytes);
560 # replace bad byte. assume iso-8859-1 -> utf-8
561 # ascii2utf8 does "extended ascii"... ie iso-8859-1
562 my $replacement=&unicode::ascii2utf8(\$badbyte);
563 substr($highbytes, $pos-1, 1, $replacement);
564 # update the position to continue searching (for \G)
565 pos($highbytes) = $pos+length($replacement)-1;
566 $byte_replaced = 1;
567 }
568 if ($byte_replaced) {
569 # replace this block of high bytes in the $value
570 $non_utf8_found = 1;
571 my $replength=length($highbytes); # we've changed the length
572 my $textpos=pos($value); # pos at end of last match
573 # replace bad bytes with good bytes
574 substr($value, $textpos-$highbyteslength,
575 $highbyteslength, $highbytes);
576 # update the position to continue searching (for \G)
577 pos($value)=$textpos+($replength-$highbyteslength)+1;
578 }
579 }
580
581 $$stringref = $value;
582 return $non_utf8_found;
583}
584
585# Returns true (1) if the given string is utf8 and false (0) if it isn't.
586# Does not modify the string parameter.
587sub check_is_utf8 {
588 my $value=shift;
589
590 if (!defined($value)) {
591 return 0; # not utf8 because it is undefined
592 }
593
594 $value =~ m/^/g; # to set \G
595 while ($value =~ m!\G.*?([\x80-\xff]+)!sg) {
596 my $highbytes=$1;
597 # make sure this block of high bytes is utf-8
598 $highbytes =~ /^/g; # set pos()
599 while ($highbytes =~
600 m!\G (?: [\xc0-\xdf][\x80-\xbf] | # 2 byte utf-8
601 [\xe0-\xef][\x80-\xbf]{2} | # 3 byte
602 [\xf0-\xf7][\x80-\xbf]{3} | # 4 byte
603 [\xf8-\xfb][\x80-\xbf]{4} | # 5 byte
604 [\xfc-\xfd][\x80-\xbf]{5} | # 6 byte
605 )*([\x80-\xff])? !xg
606 ) {
607 my $badbyte=$1;
608 if (defined $badbyte) { # not end of string
609 return 0; # non-utf8 found
610 }
611 }
612 }
613
614 return 1;
615}
616
617sub url_encode {
618 my ($text) = @_;
619
620 if (!&is_url_encoded($text)) {
621 $text =~ s/([^0-9A-Z\ \.\-\_])/sprintf("%%%02X", ord($1))/iseg;
622 # return the url-encoded character entity for underscore back to the entity
623 $text =~ s/%26%23095%3B/&\#095;/g;
624 }
625 return $text;
626}
627
628sub url_decode {
629 my ($text,$and_numeric_entities) = @_;
630
631 $text =~ s/\%([0-9A-F]{2})/pack('C', hex($1))/ige;
632
633 if ((defined $and_numeric_entities) && ($and_numeric_entities)) {
634 $text =~ s/\&\#x([0-9A-F]+);/pack('C', hex($1))/ige;
635 $text =~ s/\&\#u?([0-9]+);/pack('C', $1)/ige;
636 }
637
638 return $text;
639}
640
641sub url_decode_utf8 {
642 my ($text,$and_numeric_entities) = @_;
643
644 $text =~ s/\%([0-9A-F]{2})/pack('b', hex($1))/ige;
645
646 $text = Encode::decode("utf8",$text);
647
648 return $text;
649}
650
651sub is_url_encoded {
652 my ($text) = @_;
653 return ($text =~ m/\%([0-9A-F]{2})/i) || ($text =~ m/\&\#x([0-9A-F]+;)/i) || ($text =~ m/\&\#([0-9]+;)/i);
654}
655
656# When a filename on the filesystem is already URL-encoded, the
657# URL to it will have %25s in place of every % sign, so that
658# URLs in html pages can refer to the URL-encoded filename.
659# This method changes the URL reference back into the actual
660# (URL-encoded) filename on the filesystem by replacing %25 with %.
661sub url_to_filename {
662 my ($text) =@_;
663 $text =~ s/%25/%/g if &is_url_encoded($text);
664 # DM safing would have replaced underscores with character entity &#095;
665 # in SourceFile meta. Undo any such change to get the filename referred to.
666 $text =~ s/&\#095;/_/g;
667 return $text;
668}
669
670# When a filename on the filesystem is already URL-encoded, the
671# URL to it will have %25s in place of every % sign, so that
672# URLs in html pages can refer to the URL-encoded filename.
673# Given a (URL-encoded) filename on the filesystem, this subroutine
674# returns the URL reference string for it by replacing % with %25.
675# The output string will be the same as the input string if the input
676# already contains one or more %25s. This is to prevent processing
677# a url more than once this way.
678sub filename_to_url {
679 my ($text) = @_;
680
681 if($text !~ m/%25/) {
682 $text =~ s/%/%25/g;
683 }
684 return $text;
685}
686
687sub base64_encode {
688 my ($text) = @_;
689 if(!&conforms_to_mod_base64($text)) {
690 # return entity for underscore to underscore before encoding
691 $text =~ s/&\#095;/_/g;
692
693 $text = &MIME::Base64::encode_base64($text);
694 # base64 encoding may introduce + and / signs,
695 # replacing them with - and _ to ensure it's filename-safe
696 $text =~ s/\+/\-/g; # + -> -
697 $text =~ s/\//\_/g; # / -> _
698 }
699 return $text;
700}
701
702# If the input fits the modified base64 pattern, this will try decoding it.
703# Still, this method does not guarantee the return value is the 'original', only
704# that the result is where the base64 decoding process has been applied once.
705# THIS METHOD IS NOT USED at the moment. It's here for convenience and symmetry.
706sub base64_decode {
707 my ($text) = @_;
708 if(&conforms_to_mod_base64($text)) {
709 # base64 encodes certain chars with + and /, but if we'd encoded it, we'd
710 # have replaced them with - and _ respectively. Undo this before decoding.
711 $text =~ s/\-/\+/g; # - -> +
712 $text =~ s/\_/\//g; # _ -> /
713 $text = &MIME::Base64::decode_base64($text);
714 }
715 return $text;
716}
717
718# Returns true if the given string is compatible with a modified version
719# of base64 (where the + and / are replaced with - and _), a format which
720# includes also regular ASCII alphanumeric values. This method does not
721# guarantee that the given string is actually base64 encoded, since it will
722# return true for any simple alphanumeric ASCII string as well.
723sub conforms_to_mod_base64 {
724 my ($text) = @_;
725
726 # need to treat the entity ref for underscore as underscore
727 $text =~ s/&\#095;/_/g;
728
729 # base 64 takes alphanumeric and [=+/],
730 # but we use modified base64 where + and / are replaced with - and _
731 return ($text =~ m/^[A-Za-z0-9\=\-\_]+$/); #alphanumeric and [=-_]
732}
733
734sub substr
735{
736 my ($utf8_string, $offset, $length) = @_;
737
738 my @unicode_string = @{&utf82unicode($utf8_string)};
739 my $unicode_string_length = scalar(@unicode_string);
740
741 my $substr_start = $offset;
742 if ($substr_start >= $unicode_string_length) {
743 return "";
744 }
745
746 my $substr_end = $offset + $length - 1;
747 if ($substr_end >= $unicode_string_length) {
748 $substr_end = $unicode_string_length - 1;
749 }
750
751 my @unicode_substring = @unicode_string[$substr_start..$substr_end];
752 return &unicode2utf8(\@unicode_substring);
753}
754
755# Useful method to print UTF8 (or other unicode) for debugging.
756# Characters that are easily displayed (that is, printable ASCII)
757# are shown as-is, whereas hex values of the unicode code points
758# are shown for all other chars.
759sub debug_unicode_string
760{
761 join("",
762 map { $_ > 127 ? # if wide character...
763 sprintf("\\x{%04X}", $_) : # \x{...}
764 chr($_)
765 } unpack("U*", $_[0])); # unpack Unicode characters
766}
767
768
769sub raw_filename_to_url_encoded
770{
771 my ($str_in) = @_;
772
773 my @url_encoded_chars
774 = map { $_ > 255 ? # Needs to be represent in entity form
775 sprintf("&#x%X;",$_) :
776 $_>127 || $_==ord("%") ? # Representable in %XX form
777 sprintf("%%%2X", $_) :
778 chr($_) # otherwise, Ascii char
779 } unpack("U*", $str_in); # Unpack Unicode characters
780
781
782 my $str_out = join("", @url_encoded_chars);
783
784 return $str_out;
785
786}
787
788sub url_encoded_to_raw_filename
789{
790 my ($str_in) = @_;
791
792 my $str_out = $str_in;
793
794 $str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
795 $str_out =~ s/&#x([0-9A-F]+);/chr(hex($1))/eig;
796 $str_out =~ s/&#([0-9]+);/chr($1)/eig;
797
798 return $str_out;
799}
800
801
802sub raw_filename_to_utf8_url_encoded
803{
804 my ($str_in) = @_;
805
806 $str_in = Encode::encode("utf8",$str_in) if !check_is_utf8($str_in);
807
808 my @url_encoded_chars
809 = map { $_ > 127 ? # Representable in %XX form
810 sprintf("%%%2X", $_) :
811 chr($_) # otherwise, Ascii char
812 } unpack("U*", $str_in); # Unpack utf8 characters
813
814
815 my $str_out = join("", @url_encoded_chars);
816
817 return $str_out;
818
819}
820
821sub utf8_url_encoded_to_raw_filename
822{
823 my ($str_in) = @_;
824
825 my $utf8_str_out = $str_in;
826
827 $utf8_str_out =~ s/%([0-9A-F]{2})/chr(hex($1))/eig;
828
829 my $unicode_str_out = decode("utf8",$utf8_str_out);
830 my $raw_str_out = utf8::downgrade($unicode_str_out);
831
832 return $raw_str_out;
833}
834
835sub analyze_raw_string
836{
837 my ($str_in) = @_;
838
839 my $uses_bytecodes = 0;
840 my $exceeds_bytecodes = 0;
841
842 map { $exceeds_bytecodes = 1 if ($_ >= 256);
843 $uses_bytecodes = 1 if (($_ >= 128) && ($_ < 256));
844 } unpack("U*", $str_in); # Unpack Unicode characters
845
846 return ($uses_bytecodes,$exceeds_bytecodes);
847}
848
849
8501;
Note: See TracBrowser for help on using the repository browser.